http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/test/java/org/apache/mahout/text/doc/MultipleFieldsDocument.java ---------------------------------------------------------------------- diff --git a/community/mahout-mr/integration/src/test/java/org/apache/mahout/text/doc/MultipleFieldsDocument.java b/community/mahout-mr/integration/src/test/java/org/apache/mahout/text/doc/MultipleFieldsDocument.java new file mode 100644 index 0000000..7483b2d --- /dev/null +++ b/community/mahout-mr/integration/src/test/java/org/apache/mahout/text/doc/MultipleFieldsDocument.java @@ -0,0 +1,58 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.mahout.text.doc; + +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.TextField; + +/** + * Used for testing lucene2seq + */ +@Deprecated +public class MultipleFieldsDocument extends SingleFieldDocument { + + public static final String FIELD1 = "field1"; + public static final String FIELD2 = "field2"; + + private String field1; + private String field2; + + public MultipleFieldsDocument(String id, String field, String field1, String field2) { + super(id, field); + this.field1 = field1; + this.field2 = field2; + } + + public String getField1() { + return field1; + } + + public String getField2() { + return field2; + } + + @Override + public Document asLuceneDocument() { + Document document = super.asLuceneDocument(); + + document.add(new TextField(FIELD1, this.field1, Field.Store.YES)); + document.add(new TextField(FIELD2, this.field2, Field.Store.YES)); + + return document; + } +}
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/test/java/org/apache/mahout/text/doc/NumericFieldDocument.java ---------------------------------------------------------------------- diff --git a/community/mahout-mr/integration/src/test/java/org/apache/mahout/text/doc/NumericFieldDocument.java b/community/mahout-mr/integration/src/test/java/org/apache/mahout/text/doc/NumericFieldDocument.java new file mode 100644 index 0000000..e06e8d6 --- /dev/null +++ b/community/mahout-mr/integration/src/test/java/org/apache/mahout/text/doc/NumericFieldDocument.java @@ -0,0 +1,54 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.mahout.text.doc; + +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.IntField; +import org.apache.lucene.document.StringField; +import org.apache.lucene.document.TextField; + +/** + * Document with numeric field. 
+ */ +@Deprecated +public class NumericFieldDocument extends SingleFieldDocument { + + public static final String NUMERIC_FIELD = "numeric"; + + private int numericField; + + public NumericFieldDocument(String id, String field, int numericField) { + super(id, field); + this.numericField = numericField; + } + + @Override + public Document asLuceneDocument() { + Document document = new Document(); + + document.add(new StringField(ID_FIELD, getId(), Field.Store.YES)); + document.add(new TextField(FIELD, getField(), Field.Store.YES)); + document.add(new IntField(NUMERIC_FIELD, numericField, Field.Store.YES)); + + return document; + } + + public int getNumericField() { + return numericField; + } +} http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/test/java/org/apache/mahout/text/doc/SingleFieldDocument.java ---------------------------------------------------------------------- diff --git a/community/mahout-mr/integration/src/test/java/org/apache/mahout/text/doc/SingleFieldDocument.java b/community/mahout-mr/integration/src/test/java/org/apache/mahout/text/doc/SingleFieldDocument.java new file mode 100644 index 0000000..4636a51 --- /dev/null +++ b/community/mahout-mr/integration/src/test/java/org/apache/mahout/text/doc/SingleFieldDocument.java @@ -0,0 +1,63 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.mahout.text.doc; + +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.StringField; +import org.apache.lucene.document.TextField; + +/** + * Used for testing lucene2seq + */ +@Deprecated +public class SingleFieldDocument implements TestDocument { + + public static final String ID_FIELD = "idField"; + public static final String FIELD = "field"; + + private String id; + private String field; + + public SingleFieldDocument(String id, String field) { + this.id = id; + this.field = field; + } + + @Override + public String getId() { + return id; + } + + @Override + public String getField() { + return field; + } + + @Override + public Document asLuceneDocument() { + Document document = new Document(); + + Field idField = new StringField(ID_FIELD, getId(), Field.Store.YES); + Field field = new TextField(FIELD, getField(), Field.Store.YES); + + document.add(idField); + document.add(field); + + return document; + } +} http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/test/java/org/apache/mahout/text/doc/TestDocument.java ---------------------------------------------------------------------- diff --git a/community/mahout-mr/integration/src/test/java/org/apache/mahout/text/doc/TestDocument.java b/community/mahout-mr/integration/src/test/java/org/apache/mahout/text/doc/TestDocument.java new file mode 100644 index 0000000..7243c71 --- /dev/null +++ 
b/community/mahout-mr/integration/src/test/java/org/apache/mahout/text/doc/TestDocument.java @@ -0,0 +1,29 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.mahout.text.doc; + +import org.apache.lucene.document.Document; +@Deprecated +public interface TestDocument { + + String getId(); + + String getField(); + + Document asLuceneDocument(); + +} http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/test/java/org/apache/mahout/text/doc/UnstoredFieldsDocument.java ---------------------------------------------------------------------- diff --git a/community/mahout-mr/integration/src/test/java/org/apache/mahout/text/doc/UnstoredFieldsDocument.java b/community/mahout-mr/integration/src/test/java/org/apache/mahout/text/doc/UnstoredFieldsDocument.java new file mode 100644 index 0000000..6eb43f6 --- /dev/null +++ b/community/mahout-mr/integration/src/test/java/org/apache/mahout/text/doc/UnstoredFieldsDocument.java @@ -0,0 +1,43 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.mahout.text.doc; + +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.StringField; + +/** + * Used for testing lucene2seq + */ +@Deprecated +public class UnstoredFieldsDocument extends SingleFieldDocument { + + public static final String UNSTORED_FIELD = "unstored"; + + public UnstoredFieldsDocument(String id, String field) { + super(id, field); + } + + @Override + public Document asLuceneDocument() { + Document document = super.asLuceneDocument(); + + document.add(new StringField(UNSTORED_FIELD, "", Field.Store.NO)); + + return document; + } +} http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/Bump125Test.java ---------------------------------------------------------------------- diff --git a/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/Bump125Test.java b/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/Bump125Test.java new file mode 100644 index 0000000..65b308f --- /dev/null +++ b/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/Bump125Test.java @@ -0,0 +1,42 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout.utils; + +import com.google.common.collect.Lists; + +import org.apache.mahout.common.MahoutTestCase; +import org.junit.Test; + +import java.util.Iterator; + +public class Bump125Test extends MahoutTestCase { + @Test + public void testIncrement() throws Exception { + Iterator<Integer> ref = Lists.newArrayList(1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 12, 14, 16, 18, 20, 25, 30, 35, 40, 50, 60, + 70, 80, 100, 120, 140, 160, 180, 200, 250, 300, 350, + 400, 500, 600, 700, 800, 1000, 1200, 1400, 1600, 1800, + 2000, 2500, 3000, 3500, 4000, 5000, 6000, 7000) + .iterator(); + Bump125 b = new Bump125(); + for (int i = 0; i < 50; i++) { + long x = b.increment(); + assertEquals(ref.next().longValue(), x); + } + } +} http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/SplitInputTest.java ---------------------------------------------------------------------- diff --git a/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/SplitInputTest.java b/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/SplitInputTest.java new file mode 100644 index 0000000..7ffa690 --- /dev/null +++ 
b/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/SplitInputTest.java @@ -0,0 +1,418 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout.utils; + +import java.io.BufferedWriter; +import java.io.IOException; +import java.io.OutputStreamWriter; +import java.io.Writer; +import java.nio.charset.Charset; + +import com.google.common.io.Closeables; +import org.apache.commons.io.Charsets; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.IntWritable; +import org.apache.hadoop.io.SequenceFile; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.util.ToolRunner; +import org.apache.mahout.classifier.ClassifierData; +import org.apache.mahout.common.MahoutTestCase; +import org.apache.mahout.common.Pair; +import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterable; +import org.apache.mahout.common.iterator.sequencefile.SequenceFileValueIterable; +import org.apache.mahout.math.SequentialAccessSparseVector; +import org.apache.mahout.math.Vector; +import org.apache.mahout.math.VectorWritable; +import org.apache.mahout.math.map.OpenObjectIntHashMap; +import 
org.junit.Before; +import org.junit.Test; + +public final class SplitInputTest extends MahoutTestCase { + + private OpenObjectIntHashMap<String> countMap; + private Charset charset; + private FileSystem fs; + private Path tempInputFile; + private Path tempTrainingDirectory; + private Path tempTestDirectory; + private Path tempMapRedOutputDirectory; + private Path tempInputDirectory; + private Path tempSequenceDirectory; + private SplitInput si; + + @Override + @Before + public void setUp() throws Exception { + Configuration conf = getConfiguration(); + fs = FileSystem.get(conf); + + super.setUp(); + + countMap = new OpenObjectIntHashMap<>(); + + charset = Charsets.UTF_8; + tempSequenceDirectory = getTestTempFilePath("tmpsequence"); + tempInputFile = getTestTempFilePath("bayesinputfile"); + tempTrainingDirectory = getTestTempDirPath("bayestrain"); + tempTestDirectory = getTestTempDirPath("bayestest"); + tempMapRedOutputDirectory = new Path(getTestTempDirPath(), "mapRedOutput"); + tempInputDirectory = getTestTempDirPath("bayesinputdir"); + + si = new SplitInput(); + si.setTrainingOutputDirectory(tempTrainingDirectory); + si.setTestOutputDirectory(tempTestDirectory); + si.setInputDirectory(tempInputDirectory); + } + + private void writeMultipleInputFiles() throws IOException { + Writer writer = null; + String currentLabel = null; + try { + for (String[] entry : ClassifierData.DATA) { + if (!entry[0].equals(currentLabel)) { + currentLabel = entry[0]; + Closeables.close(writer, false); + + writer = new BufferedWriter(new OutputStreamWriter(fs.create(new Path(tempInputDirectory, currentLabel)), + Charsets.UTF_8)); + } + countMap.adjustOrPutValue(currentLabel, 1, 1); + writer.write(currentLabel + '\t' + entry[1] + '\n'); + } + }finally { + Closeables.close(writer, false); + } + } + + private void writeSingleInputFile() throws IOException { + Writer writer = new BufferedWriter(new OutputStreamWriter(fs.create(tempInputFile), Charsets.UTF_8)); + try { + for (String[] entry 
: ClassifierData.DATA) { + writer.write(entry[0] + '\t' + entry[1] + '\n'); + } + } finally { + Closeables.close(writer, true); + } + } + + @Test + public void testSplitDirectory() throws Exception { + + writeMultipleInputFiles(); + + final int testSplitSize = 1; + si.setTestSplitSize(testSplitSize); + si.setCallback(new SplitInput.SplitCallback() { + @Override + public void splitComplete(Path inputFile, int lineCount, int trainCount, int testCount, int testSplitStart) { + int trainingLines = countMap.get(inputFile.getName()) - testSplitSize; + assertSplit(fs, inputFile, charset, testSplitSize, trainingLines, tempTrainingDirectory, tempTestDirectory); + } + }); + + si.splitDirectory(tempInputDirectory); + } + + @Test + public void testSplitFile() throws Exception { + writeSingleInputFile(); + si.setTestSplitSize(2); + si.setCallback(new TestCallback(2, 10)); + si.splitFile(tempInputFile); + } + + @Test + public void testSplitFileLocation() throws Exception { + writeSingleInputFile(); + si.setTestSplitSize(2); + si.setSplitLocation(50); + si.setCallback(new TestCallback(2, 10)); + si.splitFile(tempInputFile); + } + + @Test + public void testSplitFilePct() throws Exception { + writeSingleInputFile(); + si.setTestSplitPct(25); + + si.setCallback(new TestCallback(3, 9)); + si.splitFile(tempInputFile); + } + + @Test + public void testSplitFilePctLocation() throws Exception { + writeSingleInputFile(); + si.setTestSplitPct(25); + si.setSplitLocation(50); + si.setCallback(new TestCallback(3, 9)); + si.splitFile(tempInputFile); + } + + @Test + public void testSplitFileRandomSelectionSize() throws Exception { + writeSingleInputFile(); + si.setTestRandomSelectionSize(5); + + si.setCallback(new TestCallback(5, 7)); + si.splitFile(tempInputFile); + } + + @Test + public void testSplitFileRandomSelectionPct() throws Exception { + writeSingleInputFile(); + si.setTestRandomSelectionPct(25); + + si.setCallback(new TestCallback(3, 9)); + si.splitFile(tempInputFile); + } + + /** + * 
Create a Sequencefile for testing consisting of IntWritable + * keys and VectorWritable values + * @param path path for test SequenceFile + * @param testPoints number of records in test SequenceFile + */ + private void writeVectorSequenceFile(Path path, int testPoints) throws IOException { + Path tempSequenceFile = new Path(path, "part-00000"); + Configuration conf = getConfiguration(); + IntWritable key = new IntWritable(); + VectorWritable value = new VectorWritable(); + try (SequenceFile.Writer writer = + SequenceFile.createWriter(fs, conf, tempSequenceFile, IntWritable.class, VectorWritable.class)) { + for (int i = 0; i < testPoints; i++) { + key.set(i); + Vector v = new SequentialAccessSparseVector(4); + v.assign(i); + value.set(v); + writer.append(key, value); + } + } + } + + /** + * Create a Sequencefile for testing consisting of IntWritable keys and Text values + * @param path path for test SequenceFile + * @param testPoints number of records in test SequenceFile + */ + private void writeTextSequenceFile(Path path, int testPoints) throws IOException { + Path tempSequenceFile = new Path(path, "part-00000"); + Configuration conf = getConfiguration(); + Text key = new Text(); + Text value = new Text(); + try (SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, tempSequenceFile, Text.class, Text.class)){ + for (int i = 0; i < testPoints; i++) { + key.set(Integer.toString(i)); + value.set("Line " + i); + writer.append(key, value); + } + } + } + + /** + * Display contents of a SequenceFile + * @param sequenceFilePath path to SequenceFile + */ + private void displaySequenceFile(Path sequenceFilePath) throws IOException { + for (Pair<?,?> record : new SequenceFileIterable<>(sequenceFilePath, true, getConfiguration())) { + System.out.println(record.getFirst() + "\t" + record.getSecond()); + } + } + + /** + * Determine number of records in a SequenceFile + * @param sequenceFilePath path to SequenceFile + * @return number of records + */ + private int 
getNumberRecords(Path sequenceFilePath) throws IOException { + int numberRecords = 0; + for (Object value : new SequenceFileValueIterable<>(sequenceFilePath, true, getConfiguration())) { + numberRecords++; + } + return numberRecords; + } + + /** + * Test map reduce version of split input with Text, Text key value + * pairs in input + */ + @Test + public void testSplitInputMapReduceText() throws Exception { + writeTextSequenceFile(tempSequenceDirectory, 1000); + testSplitInputMapReduce(1000); + } + + /** Test map reduce version of split input with Text, Text key value pairs in input called from command line */ + @Test + public void testSplitInputMapReduceTextCli() throws Exception { + writeTextSequenceFile(tempSequenceDirectory, 1000); + testSplitInputMapReduceCli(1000); + } + + /** + * Test map reduce version of split input with IntWritable, Vector key value + * pairs in input + */ + @Test + public void testSplitInputMapReduceVector() throws Exception { + writeVectorSequenceFile(tempSequenceDirectory, 1000); + testSplitInputMapReduce(1000); + } + + /** + * Test map reduce version of split input with IntWritable, Vector key value + * pairs in input called from command line + */ + @Test + public void testSplitInputMapReduceVectorCli() throws Exception { + writeVectorSequenceFile(tempSequenceDirectory, 1000); + testSplitInputMapReduceCli(1000); + } + + /** + * Test map reduce version of split input through CLI + */ + private void testSplitInputMapReduceCli(int numPoints) throws Exception { + int randomSelectionPct = 25; + int keepPct = 10; + String[] args = + { "--method", "mapreduce", "--input", tempSequenceDirectory.toString(), + "--mapRedOutputDir", tempMapRedOutputDirectory.toString(), + "--randomSelectionPct", Integer.toString(randomSelectionPct), + "--keepPct", Integer.toString(keepPct), "-ow" }; + ToolRunner.run(getConfiguration(), new SplitInput(), args); + validateSplitInputMapReduce(numPoints, randomSelectionPct, keepPct); + } + + /** + * Test map reduce 
version of split input through method call + */ + private void testSplitInputMapReduce(int numPoints) throws Exception { + int randomSelectionPct = 25; + si.setTestRandomSelectionPct(randomSelectionPct); + int keepPct = 10; + si.setKeepPct(keepPct); + si.setMapRedOutputDirectory(tempMapRedOutputDirectory); + si.setUseMapRed(true); + si.splitDirectory(getConfiguration(), tempSequenceDirectory); + + validateSplitInputMapReduce(numPoints, randomSelectionPct, keepPct); + } + + /** + * Validate that number of test records and number of training records + * are consistant with keepPct and randomSelectionPct + */ + private void validateSplitInputMapReduce(int numPoints, int randomSelectionPct, int keepPct) throws IOException { + Path testPath = new Path(tempMapRedOutputDirectory, "test-r-00000"); + Path trainingPath = new Path(tempMapRedOutputDirectory, "training-r-00000"); + int numberTestRecords = getNumberRecords(testPath); + int numberTrainingRecords = getNumberRecords(trainingPath); + System.out.printf("Test data: %d records\n", numberTestRecords); + displaySequenceFile(testPath); + System.out.printf("Training data: %d records\n", numberTrainingRecords); + displaySequenceFile(trainingPath); + assertEquals((randomSelectionPct / 100.0) * (keepPct / 100.0) * numPoints, + numberTestRecords, 2); + assertEquals( + (1 - randomSelectionPct / 100.0) * (keepPct / 100.0) * numPoints, + numberTrainingRecords, 2); + } + + @Test + public void testValidate() throws Exception { + SplitInput st = new SplitInput(); + assertValidateException(st); + + st.setTestSplitSize(100); + assertValidateException(st); + + st.setTestOutputDirectory(tempTestDirectory); + assertValidateException(st); + + st.setTrainingOutputDirectory(tempTrainingDirectory); + st.validate(); + + st.setTestSplitPct(50); + assertValidateException(st); + + st = new SplitInput(); + st.setTestRandomSelectionPct(50); + st.setTestOutputDirectory(tempTestDirectory); + st.setTrainingOutputDirectory(tempTrainingDirectory); + 
st.validate(); + + st.setTestSplitPct(50); + assertValidateException(st); + + st = new SplitInput(); + st.setTestRandomSelectionPct(50); + st.setTestOutputDirectory(tempTestDirectory); + st.setTrainingOutputDirectory(tempTrainingDirectory); + st.validate(); + + st.setTestSplitSize(100); + assertValidateException(st); + } + + private class TestCallback implements SplitInput.SplitCallback { + private final int testSplitSize; + private final int trainingLines; + + private TestCallback(int testSplitSize, int trainingLines) { + this.testSplitSize = testSplitSize; + this.trainingLines = trainingLines; + } + + @Override + public void splitComplete(Path inputFile, int lineCount, int trainCount, int testCount, int testSplitStart) { + assertSplit(fs, tempInputFile, charset, testSplitSize, trainingLines, tempTrainingDirectory, tempTestDirectory); + } + } + + private static void assertValidateException(SplitInput st) throws IOException { + try { + st.validate(); + fail("Expected IllegalArgumentException"); + } catch (IllegalArgumentException iae) { + // good + } + } + + private static void assertSplit(FileSystem fs, + Path tempInputFile, + Charset charset, + int testSplitSize, + int trainingLines, + Path tempTrainingDirectory, + Path tempTestDirectory) { + + try { + Path testFile = new Path(tempTestDirectory, tempInputFile.getName()); + //assertTrue("test file exists", testFile.isFile()); + assertEquals("test line count", testSplitSize, SplitInput.countLines(fs, testFile, charset)); + + Path trainingFile = new Path(tempTrainingDirectory, tempInputFile.getName()); + //assertTrue("training file exists", trainingFile.isFile()); + assertEquals("training line count", trainingLines, SplitInput.countLines(fs, trainingFile, charset)); + } catch (IOException ioe) { + fail(ioe.toString()); + } + } +} http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/email/MailProcessorTest.java 
---------------------------------------------------------------------- diff --git a/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/email/MailProcessorTest.java b/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/email/MailProcessorTest.java new file mode 100644 index 0000000..c519f85 --- /dev/null +++ b/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/email/MailProcessorTest.java @@ -0,0 +1,72 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.mahout.utils.email; + +import java.io.File; +import java.io.StringWriter; +import java.net.URL; +import java.util.regex.Pattern; + +import org.apache.commons.io.Charsets; +import org.apache.mahout.common.MahoutTestCase; +import org.junit.Test; + +public final class MailProcessorTest extends MahoutTestCase { + + @Test + public void testLabel() throws Exception { + StringWriter writer = new StringWriter(); + MailOptions options = new MailOptions(); + options.setSeparator(":::"); + options.setCharset(Charsets.UTF_8); + options.setPatternsToMatch(new Pattern[]{ + MailProcessor.FROM_PREFIX, MailProcessor.SUBJECT_PREFIX, MailProcessor.TO_PREFIX}); + options.setInput(new File(System.getProperty("user.dir"))); + MailProcessor proc = new MailProcessor(options, "", writer); + URL url = MailProcessorTest.class.getClassLoader().getResource("test.mbox"); + File file = new File(url.toURI()); + long count = proc.parseMboxLineByLine(file); + assertEquals(7, count); + } + + @Test + public void testStripQuoted() throws Exception { + StringWriter writer = new StringWriter(); + MailOptions options = new MailOptions(); + options.setSeparator(":::"); + options.setCharset(Charsets.UTF_8); + options.setPatternsToMatch(new Pattern[]{ + MailProcessor.SUBJECT_PREFIX}); + options.setInput(new File(System.getProperty("user.dir"))); + options.setIncludeBody(true); + MailProcessor proc = new MailProcessor(options, "", writer); + URL url = MailProcessorTest.class.getClassLoader().getResource("test.mbox"); + File file = new File(url.toURI()); + long count = proc.parseMboxLineByLine(file); + assertEquals(7, count); + assertTrue(writer.getBuffer().toString().contains("> Cocoon Cron Block Configurable Clustering")); + writer = new StringWriter(); + proc = new MailProcessor(options, "", writer); + options.setStripQuotedText(true); + count = proc.parseMboxLineByLine(file); + assertEquals(7, count); + assertFalse(writer.getBuffer().toString().contains("> Cocoon Cron Block 
Configurable Clustering")); + + } + +} http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/BloomTokenFilterTest.java ---------------------------------------------------------------------- diff --git a/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/BloomTokenFilterTest.java b/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/BloomTokenFilterTest.java new file mode 100644 index 0000000..4fdbbbc --- /dev/null +++ b/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/BloomTokenFilterTest.java @@ -0,0 +1,154 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
package org.apache.mahout.utils.nlp.collocations.llr;

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.CharsetEncoder;

import org.apache.commons.io.Charsets;
import org.apache.hadoop.util.bloom.BloomFilter;
import org.apache.hadoop.util.bloom.Filter;
import org.apache.hadoop.util.bloom.Key;
import org.apache.hadoop.util.hash.Hash;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.shingle.ShingleFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.mahout.common.MahoutTestCase;
import org.junit.Test;

/**
 * Tests {@link BloomTokenFilter}: filtering a whitespace-tokenized stream
 * against a Bloom filter, in both keep-matching and drop-matching modes,
 * with and without shingles.
 */
public final class BloomTokenFilterTest extends MahoutTestCase {

  private static final CharsetEncoder encoder = Charsets.UTF_8.newEncoder();

  private static final String input = "The best of times the worst of times";
  private static final String[] allTokens = {
      "The", "best", "of", "times", "the", "worst", "of", "times"
  };
  private static final String[] expectedNonKeepTokens = { "best", "times", "the", "worst", "times" };
  private static final String[] expectedKeepTokens = { "The", "of", "of" };
  private static final String[] filterTokens = { "The", "of" };
  private static final String[] notFilterTokens = { "best", "worst", "the", "times"};
  private static final String[] shingleKeepTokens = {
      "The best", "best of times", "the worst", "worst of times", "of times"
  };
  private static final String[] expectedShingleTokens = {
      "The best", "best of times", "of times", "the worst", "worst of times", "of times"
  };

  /** test standalone filter without tokenfilter wrapping */
  @Test
  public void testFilter() throws IOException {
    Filter filter = getFilter(filterTokens);
    Key k = new Key();
    for (String s: filterTokens) {
      setKey(k,s);
      assertTrue("Key for string " + s + " should be filter member", filter.membershipTest(k));
    }

    for (String s: notFilterTokens)  {
      setKey(k,s);
      assertFalse("Key for string " + s + " should not be filter member", filter.membershipTest(k));
    }
  }

  /** normal case, unfiltered analyzer */
  @Test
  public void testAnalyzer() throws IOException {
    Reader reader = new StringReader(input);
    Analyzer analyzer = new WhitespaceAnalyzer();
    TokenStream ts = analyzer.tokenStream(null, reader);
    ts.reset();
    validateTokens(allTokens, ts);
    ts.end();
    ts.close();
  }

  /** filtered analyzer: tokens present in the filter are dropped */
  @Test
  public void testNonKeepdAnalyzer() throws IOException {
    Reader reader = new StringReader(input);
    Analyzer analyzer = new WhitespaceAnalyzer();
    TokenStream ts = analyzer.tokenStream(null, reader);
    ts.reset();
    TokenStream f = new BloomTokenFilter(getFilter(filterTokens), false /* toss matching tokens */, ts);
    validateTokens(expectedNonKeepTokens, f);
    ts.end();
    ts.close();
  }

  /** keep analyzer: only tokens present in the filter survive */
  @Test
  public void testKeepAnalyzer() throws IOException {
    Reader reader = new StringReader(input);
    Analyzer analyzer = new WhitespaceAnalyzer();
    TokenStream ts = analyzer.tokenStream(null, reader);
    ts.reset();
    TokenStream f = new BloomTokenFilter(getFilter(filterTokens), true /* keep matching tokens */, ts);
    validateTokens(expectedKeepTokens, f);
    ts.end();
    ts.close();
  }

  /** shingles, keep those matching whitelist */
  @Test
  public void testShingleFilteredAnalyzer() throws IOException {
    Reader reader = new StringReader(input);
    Analyzer analyzer = new WhitespaceAnalyzer();
    TokenStream ts = analyzer.tokenStream(null, reader);
    ts.reset();
    ShingleFilter sf = new ShingleFilter(ts, 3);
    TokenStream f = new BloomTokenFilter(getFilter(shingleKeepTokens), true, sf);
    validateTokens(expectedShingleTokens, f);
    ts.end();
    ts.close();
  }

  /** Encodes {@code s} as UTF-8 and loads it into {@code k} with weight 1.0. */
  private static void setKey(Key k, String s) throws IOException {
    ByteBuffer buffer = encoder.encode(CharBuffer.wrap(s.toCharArray()));
    // NOTE(review): buffer.array() may be longer than buffer.limit(); presumably
    // any trailing slack bytes hash identically on add and membership test — confirm.
    k.set(buffer.array(), 1.0);
  }

  /**
   * Consumes {@code ts} and asserts that its terms equal {@code expected}, in order.
   */
  private static void validateTokens(String[] expected, TokenStream ts) throws IOException {
    int pos = 0;
    while (ts.incrementToken()) {
      // Was "pos <= expected.length": when pos == expected.length that assert
      // still passed and expected[pos++] below threw
      // ArrayIndexOutOfBoundsException instead of reporting the intended
      // "too many tokens" failure. Strict < restores the intended behavior.
      assertTrue("Analyzer produced too many tokens", pos < expected.length);
      CharTermAttribute termAttr = ts.getAttribute(CharTermAttribute.class);
      assertEquals("Unexpected term", expected[pos++], termAttr.toString());
    }
    assertEquals("Analyzer produced too few terms", expected.length, pos);
  }

  /** Builds a Bloom filter pre-loaded with each of {@code tokens}. */
  private static Filter getFilter(String[] tokens) throws IOException {
    Filter filter = new BloomFilter(100,50, Hash.JENKINS_HASH);
    Key k = new Key();
    for (String s: tokens) {
      setKey(k,s);
      filter.add(k);
    }
    return filter;
  }

}
package org.apache.mahout.utils.regex;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.mahout.common.DummyRecordWriter;
import org.apache.mahout.common.MahoutTestCase;
import org.junit.Test;

import java.util.List;

/**
 * Tests {@link RegexMapper} over the fixture strings shared with
 * {@link RegexUtilsTest}.
 */
public final class RegexMapperTest extends MahoutTestCase {

  /** Extracting the q= query parameter with URL decoding. */
  @Test
  public void testRegex() throws Exception {
    RegexMapper mapper = new RegexMapper();
    Configuration conf = getConfiguration();
    conf.set(RegexMapper.REGEX, "(?<=(\\?|&)q=).*?(?=&|$)");
    conf.set(RegexMapper.TRANSFORMER_CLASS, URLDecodeTransformer.class.getName());
    DummyRecordWriter<LongWritable, Text> sink = new DummyRecordWriter<>();
    Mapper<LongWritable, Text, LongWritable, Text>.Context context =
        DummyRecordWriter.build(mapper, conf, sink);
    mapper.setup(context);

    for (int i = 0; i < RegexUtilsTest.TEST_STRS.length; i++) {
      LongWritable key = new LongWritable(i);
      mapper.map(key, new Text(RegexUtilsTest.TEST_STRS[i]), context);
      List<Text> emitted = sink.getValue(key);
      // Fixtures whose gold value is empty are not checked here.
      if (!RegexUtilsTest.GOLD[i].isEmpty()) {
        assertEquals(1, emitted.size());
        assertEquals(RegexUtilsTest.GOLD[i], emitted.get(0).toString());
      }
    }
  }

  /** Keeping only capture groups 1 and 3 of a dotted-number match. */
  @Test
  public void testGroups() throws Exception {
    RegexMapper mapper = new RegexMapper();
    Configuration conf = getConfiguration();
    conf.set(RegexMapper.REGEX, "(\\d+)\\.(\\d+)\\.(\\d+)");
    conf.set(RegexMapper.TRANSFORMER_CLASS, URLDecodeTransformer.class.getName());
    conf.setStrings(RegexMapper.GROUP_MATCHERS, "1", "3");
    DummyRecordWriter<LongWritable, Text> sink = new DummyRecordWriter<>();
    Mapper<LongWritable, Text, LongWritable, Text>.Context context =
        DummyRecordWriter.build(mapper, conf, sink);
    mapper.setup(context);

    for (int i = 0; i < RegexUtilsTest.TEST_STRS.length; i++) {
      LongWritable key = new LongWritable(i);
      mapper.map(key, new Text(RegexUtilsTest.TEST_STRS[i]), context);
      List<Text> emitted = sink.getValue(key);
      // Every fixture line starts with 127.0.0.1, so groups 1 and 3 join to "127 0".
      assertEquals(1, emitted.size());
      assertEquals("127 0", emitted.get(0).toString());
    }
  }

  /** Same extraction as testRegex, but output shaped by {@link FPGFormatter}. */
  @Test
  public void testFPGFormatter() throws Exception {
    RegexMapper mapper = new RegexMapper();
    Configuration conf = getConfiguration();
    conf.set(RegexMapper.REGEX, "(?<=(\\?|&)q=).*?(?=&|$)");
    conf.set(RegexMapper.TRANSFORMER_CLASS, URLDecodeTransformer.class.getName());
    conf.set(RegexMapper.FORMATTER_CLASS, FPGFormatter.class.getName());
    DummyRecordWriter<LongWritable, Text> sink = new DummyRecordWriter<>();
    Mapper<LongWritable, Text, LongWritable, Text>.Context context =
        DummyRecordWriter.build(mapper, conf, sink);
    mapper.setup(context);

    RegexFormatter formatter = new FPGFormatter();
    for (int i = 0; i < RegexUtilsTest.TEST_STRS.length; i++) {
      LongWritable key = new LongWritable(i);
      mapper.map(key, new Text(RegexUtilsTest.TEST_STRS[i]), context);
      List<Text> emitted = sink.getValue(key);
      if (!RegexUtilsTest.GOLD[i].isEmpty()) {
        assertEquals(1, emitted.size());
        assertEquals(formatter.format(RegexUtilsTest.GOLD[i]), emitted.get(0).toString());
      }
    }
  }
}
diff --git a/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/regex/RegexUtilsTest.java b/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/regex/RegexUtilsTest.java new file mode 100644 index 0000000..8ae10a5 --- /dev/null +++ b/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/regex/RegexUtilsTest.java @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
package org.apache.mahout.utils.regex;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.regex.Pattern;

import org.apache.mahout.common.MahoutTestCase;
import org.junit.Test;

/**
 * Tests for {@link RegexUtils#extract}. The fixture arrays are package-visible
 * because {@link RegexMapperTest} reuses them.
 */
public final class RegexUtilsTest extends MahoutTestCase {

  static final String[] TEST_STRS = {
      "127.0.0.1 - - [01/10/2011:00:01:51 +0000] \"GET /solr/collection1/browse?q=foo&rows=10&wt=json&hl=true&hl.fl=body&hl.fl=content",
      "127.0.0.1 - - [01/10/2011:00:20:58 +0000] \"GET /solr/collection1/browse?q=Using+Solr+Search+RDBMS&fq=%7B%21tag%3Dsource%7D%28%28source%3Alucid+AND+lucid_facet%3A%28site%29%29%29&rows=10",
      "127.0.0.1 - - [01/10/2011:00:21:21 +0000] \"GET /solr/collection1/browse?q=language+detection&start=560&rows=10 HTTP/1.1\" 200 45071",
      "127.0.0.1 - - [01/10/2011:00:21:21 +0000] \"GET /solr/collection1/browse?q=&start=560&rows=10 HTTP/1.1\" 200 45071"
  };
  static final String[] GOLD = {"foo", "Using Solr Search RDBMS", "language detection", ""};

  /**
   * Exercises extraction with an empty keep-group list, an alternation
   * pattern, and an explicit group-to-keep collection.
   */
  @Test
  public void testExtract() throws Exception {
    Pattern pattern = Pattern.compile("(?<=(\\?|&)q=).*?(?=&|$)");
    String line = "127.0.0.1 - - [24/05/2010:01:19:22 +0000] \"GET /solr/select?q=import statement&start=1 HTTP/1.1\" 200 37571";
    String res = RegexUtils.extract(line, pattern, Collections.<Integer>emptyList(), " ", RegexUtils.IDENTITY_TRANSFORMER);
    // Fixed: the original called assertEquals(res, expected, res), passing the
    // actual value as the JUnit *message* argument — harmless to the comparison
    // but a parameter-order mistake that made failure output useless.
    assertEquals("import statement", res);

    for (int i = 0; i < TEST_STRS.length; i++) {
      res = RegexUtils.extract(TEST_STRS[i], pattern, Collections.<Integer>emptyList(), " ", new URLDecodeTransformer());
      assertEquals(GOLD[i], res);
    }

    // Alternation: capture either the q= value or the start= value.
    pattern = Pattern.compile("((?<=(\\?|&)q=)(.*?)(?=(&|$))|(?<=((\\?|&)start=))(\\d+))");
    res = RegexUtils.extract(line, pattern, Collections.<Integer>emptyList(), " ", RegexUtils.IDENTITY_TRANSFORMER);
    assertEquals("import statement 1", res);

    // Keep only capture group 1 of the match.
    pattern = Pattern.compile("(start=1) HTTP");
    Collection<Integer> groupsToKeep = new ArrayList<>();
    groupsToKeep.add(1);
    res = RegexUtils.extract(line, pattern, groupsToKeep, " ", RegexUtils.IDENTITY_TRANSFORMER);
    assertEquals("start=1", res);
  }
}
package org.apache.mahout.utils.vectors;

import java.util.Iterator;
import java.util.Random;

import com.google.common.base.Function;
import com.google.common.collect.Iterators;
import org.apache.mahout.common.RandomUtils;
import org.apache.mahout.common.iterator.CountingIterator;
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.RandomAccessSparseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.function.DoubleFunction;

/**
 * Test helper that yields a fixed number of randomly-filled vectors, either
 * dense or sparse, each of cardinality equal to the item count.
 */
public final class RandomVectorIterable implements Iterable<Vector> {

  public enum VectorType {DENSE, SPARSE}

  private final int numItems;
  private final VectorType type;

  /** 100 sparse vectors. */
  public RandomVectorIterable() {
    this(100, VectorType.SPARSE);
  }

  /** {@code numItems} sparse vectors. */
  public RandomVectorIterable(int numItems) {
    this(numItems, VectorType.SPARSE);
  }

  public RandomVectorIterable(int numItems, VectorType type) {
    this.numItems = numItems;
    this.type = type;
  }

  @Override
  public Iterator<Vector> iterator() {
    Function<Integer, Vector> makeVector = new Function<Integer, Vector>() {
      private final Random random = RandomUtils.getRandom();
      @Override
      public Vector apply(Integer unused) {
        Vector vector = type == VectorType.SPARSE
            ? new RandomAccessSparseVector(numItems)
            : new DenseVector(numItems);
        // Fill every cell with a fresh random double.
        vector.assign(new DoubleFunction() {
          @Override
          public double apply(double ignored) {
            return random.nextDouble();
          }
        });
        return vector;
      }
    };
    return Iterators.transform(new CountingIterator(numItems), makeVector);
  }

}
package org.apache.mahout.utils.vectors;

import java.util.Random;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.mahout.common.MahoutTestCase;
import org.apache.mahout.common.RandomUtils;
import org.apache.mahout.math.SequentialAccessSparseVector;
import org.apache.mahout.math.Vector;
import org.junit.Before;
import org.junit.Test;

/**
 * Tests for {@link VectorHelper}: JSON rendering, top-entry selection and
 * term-dictionary loading.
 */
public final class VectorHelperTest extends MahoutTestCase {

  private static final int NUM_DOCS = 100;

  private Path inputPathOne;
  private Path inputPathTwo;

  private Configuration conf;

  @Override
  @Before
  public void setUp() throws Exception {
    super.setUp();
    conf = getConfiguration();

    // One dictionary whose indices all exceed the dictionary size...
    inputPathOne = getTestTempFilePath("documents/docs-one.file");
    writeDictionary(inputPathOne, NUM_DOCS);

    // ...and one whose indices all stay within it.
    inputPathTwo = getTestTempFilePath("documents/docs-two.file");
    writeDictionary(inputPathTwo, 0);
  }

  /**
   * Writes NUM_DOCS (term, index) pairs to {@code path}; each index is random
   * in [offset, offset + NUM_DOCS).
   */
  private void writeDictionary(Path path, int offset) throws Exception {
    FileSystem fs = FileSystem.get(path.toUri(), conf);
    try (SequenceFile.Writer writer =
             new SequenceFile.Writer(fs, conf, path, Text.class, IntWritable.class)) {
      Random rd = RandomUtils.getRandom();
      for (int i = 0; i < NUM_DOCS; i++) {
        writer.append(new Text("Document::ID::" + i), new IntWritable(offset + rd.nextInt(NUM_DOCS)));
      }
    }
  }

  @Test
  public void testJsonFormatting() throws Exception {
    Vector vector = new SequentialAccessSparseVector(10);
    vector.set(2, 3.1);
    vector.set(4, 1.0);
    vector.set(6, 8.1);
    vector.set(7, -100);
    vector.set(9, 12.2);
    String unused = "UNUSED";
    String[] dictionary = {
        unused, unused, "two", unused, "four", unused, "six", "seven", unused, "nine"
    };

    assertEquals("sorted json form incorrect: ", "{nine:12.2,six:8.1,two:3.1}",
        VectorHelper.vectorToJson(vector, dictionary, 3, true));
    assertEquals("unsorted form incorrect: ", "{two:3.1,four:1.0}",
        VectorHelper.vectorToJson(vector, dictionary, 2, false));
    assertEquals("sorted json form incorrect: ", "{nine:12.2,six:8.1,two:3.1,four:1.0}",
        VectorHelper.vectorToJson(vector, dictionary, 4, true));
    assertEquals("sorted json form incorrect: ", "{nine:12.2,six:8.1,two:3.1,four:1.0,seven:-100.0}",
        VectorHelper.vectorToJson(vector, dictionary, 5, true));
    assertEquals("sorted json form incorrect: ", "{nine:12.2,six:8.1}",
        VectorHelper.vectorToJson(vector, dictionary, 2, true));
    assertEquals("unsorted form incorrect: ", "{two:3.1,four:1.0}",
        VectorHelper.vectorToJson(vector, dictionary, 2, false));
  }

  @Test
  public void testTopEntries() throws Exception {
    Vector vector = new SequentialAccessSparseVector(10);
    vector.set(2, 3.1);
    vector.set(4, 1.0);
    vector.set(6, 8.1);
    vector.set(7, -100);
    vector.set(9, 12.2);
    vector.set(1, 0.0);
    vector.set(3, 0.0);
    vector.set(8, 2.7);
    // maxEntries equal to the number of non-zero elements.
    assertEquals(6, VectorHelper.topEntries(vector, 6).size());
    // maxEntries larger than the number of non-zero elements.
    assertTrue(VectorHelper.topEntries(vector, 9).size() < 9);
    // maxEntries smaller than the number of non-zero elements.
    assertTrue(VectorHelper.topEntries(vector, 5).size() < vector.getNumNonZeroElements());
  }

  @Test
  public void testTopEntriesWhenAllZeros() throws Exception {
    Vector vector = new SequentialAccessSparseVector(10);
    vector.set(2, 0.0);
    vector.set(4, 0.0);
    vector.set(6, 0.0);
    vector.set(7, 0);
    vector.set(9, 0.0);
    vector.set(1, 0.0);
    vector.set(3, 0.0);
    vector.set(8, 0.0);
    assertEquals(0, VectorHelper.topEntries(vector, 6).size());
  }

  @Test
  public void testLoadTermDictionary() throws Exception {
    // Indices larger than the dictionary size must not break loading...
    VectorHelper.loadTermDictionary(conf, inputPathOne.toString());
    // ...nor must indices that all fit within it.
    VectorHelper.loadTermDictionary(conf, inputPathTwo.toString());
  }
}
+ */ + +package org.apache.mahout.utils.vectors.arff; + +import org.apache.mahout.common.MahoutTestCase; +import org.junit.Test; + +public final class ARFFTypeTest extends MahoutTestCase { + + @Test + public void removeQuotes() { + assertNull(ARFFType.removeQuotes(null)); + assertEquals("", ARFFType.removeQuotes("\"\"")); + assertEquals("", ARFFType.removeQuotes("''")); + assertEquals("", ARFFType.removeQuotes("")); + assertEquals("", ARFFType.removeQuotes(" ")); + assertEquals("single", ARFFType.removeQuotes("'single'")); + assertEquals("double", ARFFType.removeQuotes("\"double\"")); + assertEquals("trim", ARFFType.removeQuotes(" trim ")); + } +} http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterableTest.java ---------------------------------------------------------------------- diff --git a/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterableTest.java b/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterableTest.java new file mode 100644 index 0000000..4c7f17a --- /dev/null +++ b/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterableTest.java @@ -0,0 +1,289 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout.utils.vectors.arff; + +import java.io.IOException; +import java.text.DateFormat; +import java.text.SimpleDateFormat; +import java.util.Date; +import java.util.Iterator; +import java.util.Locale; +import java.util.Map; + +import com.google.common.io.Resources; +import org.apache.commons.io.Charsets; +import org.apache.mahout.common.MahoutTestCase; +import org.apache.mahout.math.DenseVector; +import org.apache.mahout.math.RandomAccessSparseVector; +import org.apache.mahout.math.Vector; +import org.junit.Test; + +public final class ARFFVectorIterableTest extends MahoutTestCase { + + @Test + public void testValues() throws Exception { + ARFFVectorIterable iterable = readModelFromResource("sample.arff"); + + assertEquals("Mahout", iterable.getModel().getRelation()); + Map<String, Integer> bindings = iterable.getModel().getLabelBindings(); + assertNotNull(bindings); + assertEquals(5, bindings.size()); + Iterator<Vector> iter = iterable.iterator(); + assertTrue(iter.hasNext()); + Vector next = iter.next(); + assertNotNull(next); + assertTrue("Wrong instanceof", next instanceof DenseVector); + assertEquals(1.0, next.get(0), EPSILON); + assertEquals(2.0, next.get(1), EPSILON); + assertTrue(iter.hasNext()); + next = iter.next(); + assertNotNull(next); + assertTrue("Wrong instanceof", next instanceof DenseVector); + assertEquals(2.0, next.get(0), EPSILON); + assertEquals(3.0, next.get(1), EPSILON); + + assertTrue(iter.hasNext()); + next = iter.next(); + assertNotNull(next); + assertTrue("Wrong instanceof", next instanceof 
RandomAccessSparseVector); + assertEquals(5.0, next.get(0), EPSILON); + assertEquals(23.0, next.get(1), EPSILON); + + assertFalse(iter.hasNext()); + } + + @Test + public void testDense() throws Exception { + Iterable<Vector> iterable = readModelFromResource("sample-dense.arff"); + Vector firstVector = iterable.iterator().next(); + assertEquals(1.0, firstVector.get(0), 0); + assertEquals(65.0, firstVector.get(1), 0); + assertEquals(1.0, firstVector.get(3), 0); + assertEquals(1.0, firstVector.get(4), 0); + + int count = 0; + for (Vector vector : iterable) { + assertTrue("Vector is not dense", vector instanceof DenseVector); + count++; + } + assertEquals(5, count); + } + + @Test + public void testSparse() throws Exception { + Iterable<Vector> iterable = readModelFromResource("sample-sparse.arff"); + + Vector firstVector = iterable.iterator().next(); + assertEquals(23.1, firstVector.get(1), 0); + assertEquals(3.23, firstVector.get(2), 0); + assertEquals(1.2, firstVector.get(3), 0); + + int count = 0; + for (Vector vector : iterable) { + assertTrue("Vector is not dense", vector instanceof RandomAccessSparseVector); + count++; + } + assertEquals(9, count); + } + + @Test + public void testNonNumeric() throws Exception { + MapBackedARFFModel model = new MapBackedARFFModel(); + ARFFVectorIterable iterable = getVectors("non-numeric-1.arff", model); + int count = 0; + for (Vector vector : iterable) { + assertTrue("Vector is not dense", vector instanceof RandomAccessSparseVector); + count++; + } + + iterable = getVectors("non-numeric-1.arff", model); + Iterator<Vector> iter = iterable.iterator(); + Vector firstVector = iter.next(); + + assertEquals(1.0, firstVector.get(2), 0); + + assertEquals(10, count); + Map<String, Map<String, Integer>> nominalMap = iterable.getModel().getNominalMap(); + assertNotNull(nominalMap); + assertEquals(1, nominalMap.size()); + Map<String, Integer> noms = nominalMap.get("bar"); + assertNotNull("nominals for bar are null", noms); + assertEquals(5, 
noms.size()); + Map<Integer, ARFFType> integerARFFTypeMap = model.getTypeMap(); + assertNotNull("Type map null", integerARFFTypeMap); + assertEquals(5, integerARFFTypeMap.size()); + Map<String, Long> words = model.getWords(); + assertNotNull("words null", words); + assertEquals(10, words.size()); + Map<Integer, DateFormat> integerDateFormatMap = model.getDateMap(); + assertNotNull("date format null", integerDateFormatMap); + assertEquals(1, integerDateFormatMap.size()); + } + + @Test + public void testDate() throws Exception { + ARFFVectorIterable iterable = readModelFromResource("date.arff"); + Iterator<Vector> iter = iterable.iterator(); + Vector firstVector = iter.next(); + + DateFormat format = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss", Locale.ENGLISH); + Date date = format.parse("2001-07-04T12:08:56"); + long result = date.getTime(); + assertEquals(result, firstVector.get(1), 0); + + format = new SimpleDateFormat("yyyy.MM.dd G 'at' HH:mm:ss z", Locale.ENGLISH); + date = format.parse("2001.07.04 AD at 12:08:56 PDT"); + result = date.getTime(); + assertEquals(result, firstVector.get(2), 0); + + format = new SimpleDateFormat("EEE, MMM d, ''yy", Locale.ENGLISH); + date = format.parse("Wed, Jul 4, '01,4 0:08 PM, PDT"); + result = date.getTime(); + assertEquals(result, firstVector.get(3), 0); + + format = new SimpleDateFormat("K:mm a, z", Locale.ENGLISH); + date = format.parse("0:08 PM, PDT"); + result = date.getTime(); + assertEquals(result, firstVector.get(4), 0); + + format = new SimpleDateFormat("yyyyy.MMMMM.dd GGG hh:mm aaa", Locale.ENGLISH); + date = format.parse("02001.July.04 AD 12:08 PM"); + result = date.getTime(); + assertEquals(result, firstVector.get(5), 0); + + format = new SimpleDateFormat("EEE, d MMM yyyy HH:mm:ss Z", Locale.ENGLISH); + date = format.parse("Wed, 4 Jul 2001 12:08:56 -0700"); + result = date.getTime(); + assertEquals(result, firstVector.get(6), 0); + + } + + @Test + public void testMultipleNoms() throws Exception { + 
MapBackedARFFModel model = new MapBackedARFFModel(); + ARFFVectorIterable iterable = getVectors("non-numeric-1.arff", model); + int count = 0; + for (Vector vector : iterable) { + assertTrue("Vector is not dense", vector instanceof RandomAccessSparseVector); + count++; + } + assertEquals(10, count); + Map<String,Map<String,Integer>> nominalMap = iterable.getModel().getNominalMap(); + assertNotNull(nominalMap); + assertEquals(1, nominalMap.size()); + Map<String,Integer> noms = nominalMap.get("bar"); + assertNotNull("nominals for bar are null", noms); + assertEquals(5, noms.size()); + Map<Integer,ARFFType> integerARFFTypeMap = model.getTypeMap(); + assertNotNull("Type map null", integerARFFTypeMap); + assertEquals(5, integerARFFTypeMap.size()); + Map<String,Long> words = model.getWords(); + assertNotNull("words null", words); + assertEquals(10, words.size()); + + Map<Integer,DateFormat> integerDateFormatMap = model.getDateMap(); + assertNotNull("date format null", integerDateFormatMap); + assertEquals(1, integerDateFormatMap.size()); + + + iterable = getVectors("non-numeric-2.arff", model); + count = 0; + for (Vector vector : iterable) { + assertTrue("Vector is not dense", vector instanceof RandomAccessSparseVector); + count++; + } + nominalMap = model.getNominalMap(); + assertNotNull(nominalMap); + assertEquals(2, nominalMap.size()); + noms = nominalMap.get("test"); + assertNotNull("nominals for bar are null", noms); + assertEquals(2, noms.size()); + } + + @Test + public void testNumerics() throws Exception { + String arff = "@RELATION numerics\n" + + "@ATTRIBUTE theNumeric NUMERIC\n" + + "@ATTRIBUTE theInteger INTEGER\n" + + "@ATTRIBUTE theReal REAL\n" + + "@DATA\n" + + "1.0,2,3.0"; + ARFFModel model = new MapBackedARFFModel(); + ARFFVectorIterable iterable = new ARFFVectorIterable(arff, model); + model = iterable.getModel(); + assertNotNull(model); + assertEquals(3, model.getLabelSize()); + assertEquals(ARFFType.NUMERIC, model.getARFFType(0)); + 
assertEquals(ARFFType.INTEGER, model.getARFFType(1)); + assertEquals(ARFFType.REAL, model.getARFFType(2)); + Iterator<Vector> it = iterable.iterator(); + Vector vector = it.next(); + assertEquals(1.0, vector.get(0), EPSILON); + assertEquals(2.0, vector.get(1), EPSILON); + assertEquals(3.0, vector.get(2), EPSILON); + } + + @Test + public void testQuotes() throws Exception { + // ARFF allows quotes on identifiers + ARFFModel model = new MapBackedARFFModel(); + ARFFVectorIterable iterable = getVectors("quoted-id.arff", model); + model = iterable.getModel(); + assertNotNull(model); + assertEquals("quotes", model.getRelation()); + + // check attribute labels + assertEquals(4, model.getLabelSize()); + assertEquals(ARFFType.NUMERIC, model.getARFFType(0)); + assertEquals(ARFFType.INTEGER, model.getARFFType(1)); + assertEquals(ARFFType.REAL, model.getARFFType(2)); + assertEquals(ARFFType.NOMINAL, model.getARFFType(3)); + + Map<String, Integer> labelBindings = model.getLabelBindings(); + assertTrue(labelBindings.keySet().contains("thenumeric")); + assertTrue(labelBindings.keySet().contains("theinteger")); + assertTrue(labelBindings.keySet().contains("thereal")); + assertTrue(labelBindings.keySet().contains("thenominal")); + + // check nominal values + Map<String, Integer> nominalMap = model.getNominalMap().get("thenominal"); + assertNotNull(nominalMap); + assertEquals(3, nominalMap.size()); + assertTrue(nominalMap.keySet().contains("double-quote")); + assertTrue(nominalMap.keySet().contains("single-quote")); + assertTrue(nominalMap.keySet().contains("no-quote")); + + // check data values + Iterator<Vector> it = iterable.iterator(); + Vector vector = it.next(); + assertEquals(nominalMap.get("no-quote"), vector.get(3), EPSILON); + assertEquals(nominalMap.get("single-quote"), it.next().get(3), EPSILON); + assertEquals(nominalMap.get("double-quote"), it.next().get(3), EPSILON); + } + + static ARFFVectorIterable getVectors(String resourceName, ARFFModel model) throws IOException 
{ + String sample = Resources.toString(Resources.getResource(resourceName), Charsets.UTF_8); + return new ARFFVectorIterable(sample, model); + } + + private static ARFFVectorIterable readModelFromResource(String resourceName) throws IOException { + ARFFModel model = new MapBackedARFFModel(); + return getVectors(resourceName, model); + } + +} http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/arff/DriverTest.java ---------------------------------------------------------------------- diff --git a/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/arff/DriverTest.java b/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/arff/DriverTest.java new file mode 100644 index 0000000..7e7623e --- /dev/null +++ b/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/arff/DriverTest.java @@ -0,0 +1,54 @@ +/* + * Copyright 2013 The Apache Software Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.mahout.utils.vectors.arff; + +import java.io.IOException; +import java.io.StringWriter; + +import com.google.common.io.Resources; +import org.apache.commons.io.Charsets; +import org.apache.mahout.common.MahoutTestCase; +import org.junit.Test; + +/** + * Test case for {@link Driver} + */ +public class DriverTest extends MahoutTestCase { + + @Test + public void dictionary() throws IOException { + + ARFFModel model = new MapBackedARFFModel(); + ARFFVectorIterableTest.getVectors("sample-dense.arff", model); + StringWriter writer = new StringWriter(); + Driver.writeLabelBindings(writer, model, ","); + String expected1 = Resources.toString(Resources.getResource("expected-arff-dictionary.csv"), Charsets.UTF_8); + String expected2 = Resources.toString(Resources.getResource("expected-arff-dictionary-2.csv"), Charsets.UTF_8); + assertTrue(expected1.equals(writer.toString()) || expected2.equals(writer.toString())); + } + + + @Test + public void dictionaryJSON() throws IOException { + ARFFModel model = new MapBackedARFFModel(); + ARFFVectorIterableTest.getVectors("sample-dense.arff", model); + StringWriter writer = new StringWriter(); + Driver.writeLabelBindingsJSON(writer, model); + String expected1 = Resources.toString(Resources.getResource("expected-arff-schema.json"), Charsets.UTF_8); + String expected2 = Resources.toString(Resources.getResource("expected-arff-schema-2.json"), Charsets.UTF_8); + assertTrue(expected1.equals(writer.toString()) || expected2.equals(writer.toString())); + } +} http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModelTest.java ---------------------------------------------------------------------- diff --git a/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModelTest.java 
b/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModelTest.java new file mode 100644 index 0000000..2867640 --- /dev/null +++ b/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModelTest.java @@ -0,0 +1,60 @@ +/* + * Copyright 2013 The Apache Software Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout.utils.vectors.arff; + +import org.apache.mahout.common.MahoutTestCase; +import org.junit.Test; + +import java.util.Map; + +public class MapBackedARFFModelTest extends MahoutTestCase { + + @Test + public void processNominal() { + String windy = "windy"; + String breezy = "breezy"; + + ARFFModel model = new MapBackedARFFModel(); + model.addNominal(windy, breezy, 77); + model.addNominal(windy, "strong", 23); + model.addNominal(windy, "nuking", 55); + Map<String, Map<String, Integer>> nominalMap = model.getNominalMap(); + + assertEquals(1, nominalMap.size()); + Map<String, Integer> windyValues = nominalMap.get(windy); + assertEquals(77, windyValues.get(breezy).intValue()); + } + + @Test + public void processBadNumeric() { + ARFFModel model = new MapBackedARFFModel(); + model.addLabel("b1shkt70694difsmmmdv0ikmoh", 77); + model.addType(77, ARFFType.REAL); + assertTrue(Double.isNaN(model.getValue("b1shkt70694difsmmmdv0ikmoh", 77))); + } + + @Test + public void processGoodNumeric() { + ARFFModel model = new MapBackedARFFModel(); + 
model.addLabel("1234", 77); + model.addType(77, ARFFType.INTEGER); + assertTrue(1234 == model.getValue("1234", 77)); + model.addLabel("131.34", 78); + model.addType(78, ARFFType.REAL); + assertTrue(131.34 == model.getValue("131.34", 78)); + } +} http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/csv/CSVVectorIteratorTest.java ---------------------------------------------------------------------- diff --git a/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/csv/CSVVectorIteratorTest.java b/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/csv/CSVVectorIteratorTest.java new file mode 100644 index 0000000..e76cf70 --- /dev/null +++ b/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/csv/CSVVectorIteratorTest.java @@ -0,0 +1,57 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * <p/> + * http://www.apache.org/licenses/LICENSE-2.0 + * <p/> + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.mahout.utils.vectors.csv; + +import java.io.IOException; +import java.io.StringReader; +import java.io.StringWriter; +import java.util.Iterator; + +import org.apache.mahout.common.MahoutTestCase; +import org.apache.mahout.math.Vector; +import org.apache.mahout.utils.vectors.RandomVectorIterable; +import org.apache.mahout.utils.vectors.VectorHelper; +import org.apache.mahout.utils.vectors.io.TextualVectorWriter; +import org.junit.Test; + +public class CSVVectorIteratorTest extends MahoutTestCase { + + @Test + public void testCount() throws Exception { + + StringWriter sWriter = new StringWriter(); + try (TextualVectorWriter writer = new TextualVectorWriter(sWriter) { + @Override + public void write(Vector vector) throws IOException { + String vecStr = VectorHelper.vectorToCSVString(vector, false); + getWriter().write(vecStr); + } + }) { + Iterable<Vector> iter = new RandomVectorIterable(50); + writer.write(iter); + } + + Iterator<Vector> csvIter = new CSVVectorIterator(new StringReader(sWriter.getBuffer().toString())); + int count = 0; + while (csvIter.hasNext()) { + csvIter.next(); + count++; + } + assertEquals(50, count); + } +} http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/io/VectorWriterTest.java ---------------------------------------------------------------------- diff --git a/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/io/VectorWriterTest.java b/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/io/VectorWriterTest.java new file mode 100644 index 0000000..e2f7032 --- /dev/null +++ b/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/io/VectorWriterTest.java @@ -0,0 +1,67 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout.utils.vectors.io; + +import java.io.StringWriter; +import java.util.ArrayList; +import java.util.Collection; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.SequenceFile; +import org.apache.mahout.common.HadoopUtil; +import org.apache.mahout.common.MahoutTestCase; +import org.apache.mahout.math.DenseVector; +import org.apache.mahout.math.Vector; +import org.apache.mahout.math.VectorWritable; +import org.apache.mahout.utils.vectors.RandomVectorIterable; +import org.junit.Test; + +public final class VectorWriterTest extends MahoutTestCase { + + @Test + public void testSFVW() throws Exception { + Path path = getTestTempFilePath("sfvw"); + Configuration conf = getConfiguration(); + FileSystem fs = FileSystem.get(conf); + SequenceFile.Writer seqWriter = new SequenceFile.Writer(fs, conf, path, LongWritable.class, VectorWritable.class); + try (SequenceFileVectorWriter writer = new SequenceFileVectorWriter(seqWriter)) { + writer.write(new RandomVectorIterable(50)); + } + + long count = HadoopUtil.countRecords(path, conf); + assertEquals(50, count); + } + + @Test + public void testTextOutputSize() throws 
Exception { + StringWriter strWriter = new StringWriter(); + try (VectorWriter writer = new TextualVectorWriter(strWriter)) { + Collection<Vector> vectors = new ArrayList<>(); + vectors.add(new DenseVector(new double[]{0.3, 1.5, 4.5})); + vectors.add(new DenseVector(new double[]{1.3, 1.5, 3.5})); + writer.write(vectors); + } + String buffer = strWriter.toString(); + assertNotNull(buffer); + assertFalse(buffer.isEmpty()); + + } +} http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfoTest.java ---------------------------------------------------------------------- diff --git a/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfoTest.java b/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfoTest.java new file mode 100644 index 0000000..890a14b --- /dev/null +++ b/community/mahout-mr/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfoTest.java @@ -0,0 +1,121 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.mahout.utils.vectors.lucene; + + +import java.io.IOException; + +import com.google.common.io.Closeables; + +import org.apache.lucene.analysis.core.WhitespaceAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.FieldType; +import org.apache.lucene.document.StringField; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexOptions; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.store.RAMDirectory; +import org.apache.mahout.common.MahoutTestCase; +import org.junit.Before; +import org.junit.Test; + +public class CachedTermInfoTest extends MahoutTestCase { + private RAMDirectory directory; + private static final String[] DOCS = { + "a a b b c c", + "a b a b a b a b", + "a b a", + "a", + "b", + "a", + "a" + }; + + private static final String[] DOCS2 = { + "d d d d", + "e e e e", + "d e d e", + "d", + "e", + "d", + "e" + }; + + @Before + public void before() throws IOException { + directory = new RAMDirectory(); + + FieldType fieldType = new FieldType(); + fieldType.setStored(false); + fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); + fieldType.setTokenized(true); + fieldType.setStoreTermVectors(false); + fieldType.setStoreTermVectorPositions(false); + fieldType.setStoreTermVectorOffsets(false); + fieldType.freeze(); + + directory = createTestIndex(fieldType, directory, 0); + } + + @Test + public void test() throws Exception { + IndexReader reader = DirectoryReader.open(directory); + CachedTermInfo cti = new CachedTermInfo(reader, "content", 0, 100); + assertEquals(3, cti.totalTerms("content")); + assertNotNull(cti.getTermEntry("content", "a")); + assertNull(cti.getTermEntry("content", "e")); + //minDf + cti = new CachedTermInfo(reader, "content", 3, 100); + assertEquals(2, 
cti.totalTerms("content")); + assertNotNull(cti.getTermEntry("content", "a")); + assertNull(cti.getTermEntry("content", "c")); + //maxDFPercent, a is in 6 of 7 docs: numDocs * maxDfPercent / 100 < 6 to exclude, 85% should suffice to exclude a + cti = new CachedTermInfo(reader, "content", 0, 85); + assertEquals(2, cti.totalTerms("content")); + assertNotNull(cti.getTermEntry("content", "b")); + assertNotNull(cti.getTermEntry("content", "c")); + assertNull(cti.getTermEntry("content", "a")); + + + } + + static RAMDirectory createTestIndex(FieldType fieldType, + RAMDirectory directory, + int startingId) throws IOException { + IndexWriter writer = new IndexWriter(directory, new IndexWriterConfig(new WhitespaceAnalyzer())); + + try { + for (int i = 0; i < DOCS.length; i++) { + Document doc = new Document(); + Field id = new StringField("id", "doc_" + (i + startingId), Field.Store.YES); + doc.add(id); + Field text = new Field("content", DOCS[i], fieldType); + doc.add(text); + Field text2 = new Field("content2", DOCS2[i], fieldType); + doc.add(text2); + writer.addDocument(doc); + } + } finally { + Closeables.close(writer, false); + } + return directory; + } +}
