http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/utils/vectors/io/TextualVectorWriter.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/io/TextualVectorWriter.java b/integration/src/main/java/org/apache/mahout/utils/vectors/io/TextualVectorWriter.java
deleted file mode 100644
index cc27d1d..0000000
--- a/integration/src/main/java/org/apache/mahout/utils/vectors/io/TextualVectorWriter.java
+++ /dev/null
@@ -1,70 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.vectors.io;
-
-import java.io.IOException;
-import java.io.Writer;
-
-import com.google.common.io.Closeables;
-import org.apache.mahout.math.Vector;
-
-/**
- * Write out the vectors to any {@link Writer} using {@link Vector#asFormatString()},
- * one per line by default.
- */
-public class TextualVectorWriter implements VectorWriter {
-
-  private final Writer writer;
-
-  public TextualVectorWriter(Writer writer) {
-    this.writer = writer;
-  }
-
-  protected Writer getWriter() {
-    return writer;
-  }
-
-  @Override
-  public long write(Iterable<Vector> iterable) throws IOException {
-    return write(iterable, Long.MAX_VALUE);
-  }
-
-  @Override
-  public long write(Iterable<Vector> iterable, long maxDocs) throws IOException {
-    long result = 0;
-    for (Vector vector : iterable) {
-      if (result >= maxDocs) {
-        break;
-      }
-      write(vector);
-      result++;
-    }
-    return result;
-  }
-
-  @Override
-  public void write(Vector vector) throws IOException {
-    writer.write(vector.asFormatString());
-    writer.write('\n');
-  }
-
-  @Override
-  public void close() throws IOException {
-    Closeables.close(writer, false);
-  }
-}
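For context, a minimal usage sketch of the class removed above (not part of the commit; the vector values and the wrapper class name are invented, while all constructor and method signatures come from the deleted sources):

import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;

import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.utils.vectors.io.TextualVectorWriter;
import org.apache.mahout.utils.vectors.io.VectorWriter;

public class TextualVectorWriterExample {
  public static void main(String[] args) throws Exception {
    Iterable<Vector> vectors = Arrays.<Vector>asList(
        new DenseVector(new double[] {1.0, 0.0, 2.0}),
        new DenseVector(new double[] {0.5, 1.5, 0.0}));
    // VectorWriter extends Closeable, so try-with-resources closes the
    // underlying Writer via Closeables.close(writer, false)
    try (VectorWriter writer =
             new TextualVectorWriter(new OutputStreamWriter(System.out, StandardCharsets.UTF_8))) {
      long written = writer.write(vectors);  // one asFormatString() per line
      System.err.println("Wrote " + written + " vectors");
    }
  }
}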
http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/utils/vectors/io/VectorWriter.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/io/VectorWriter.java b/integration/src/main/java/org/apache/mahout/utils/vectors/io/VectorWriter.java
deleted file mode 100644
index 923e270..0000000
--- a/integration/src/main/java/org/apache/mahout/utils/vectors/io/VectorWriter.java
+++ /dev/null
@@ -1,52 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.vectors.io;
-
-import java.io.Closeable;
-import java.io.IOException;
-
-import org.apache.mahout.math.Vector;
-
-public interface VectorWriter extends Closeable {
-  /**
-   * Write all values in the Iterable to the output
-   * @param iterable The {@link Iterable} to loop over
-   * @return the number of docs written
-   * @throws IOException if there was a problem writing
-   *
-   */
-  long write(Iterable<Vector> iterable) throws IOException;
-
-  /**
-   * Write out a vector
-   *
-   * @param vector The {@link org.apache.mahout.math.Vector} to write
-   * @throws IOException
-   */
-  void write(Vector vector) throws IOException;
-
-  /**
-   * Write the first {@code maxDocs} to the output.
-   * @param iterable The {@link Iterable} to loop over
-   * @param maxDocs the maximum number of docs to write
-   * @return The number of docs written
-   * @throws IOException if there was a problem writing
-   */
-  long write(Iterable<Vector> iterable, long maxDocs) throws IOException;
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/AbstractLuceneIterator.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/AbstractLuceneIterator.java b/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/AbstractLuceneIterator.java
deleted file mode 100644
index ff61a70..0000000
--- a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/AbstractLuceneIterator.java
+++ /dev/null
@@ -1,140 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.vectors.lucene;
-
-import com.google.common.collect.AbstractIterator;
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.Terms;
-import org.apache.lucene.index.TermsEnum;
-import org.apache.lucene.util.BytesRef;
-import org.apache.mahout.math.NamedVector;
-import org.apache.mahout.math.Vector;
-import org.apache.mahout.utils.Bump125;
-import org.apache.mahout.utils.vectors.TermInfo;
-import org.apache.mahout.vectorizer.Weight;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.IOException;
-
-/**
- * Iterate over a Lucene index, extracting term vectors.
- * Subclasses define how much information to retrieve from the Lucene index.
- */
-public abstract class AbstractLuceneIterator extends AbstractIterator<Vector> {
-  private static final Logger log = LoggerFactory.getLogger(LuceneIterator.class);
-  protected final IndexReader indexReader;
-  protected final String field;
-  protected final TermInfo terminfo;
-  protected final double normPower;
-  protected final Weight weight;
-  protected final Bump125 bump = new Bump125();
-  protected int nextDocId;
-  protected int maxErrorDocs;
-  protected int numErrorDocs;
-  protected long nextLogRecord = bump.increment();
-  protected int skippedErrorMessages;
-
-  public AbstractLuceneIterator(TermInfo terminfo, double normPower, IndexReader indexReader, Weight weight,
-      double maxPercentErrorDocs, String field) {
-    this.terminfo = terminfo;
-    this.normPower = normPower;
-    this.indexReader = indexReader;
-
-    this.weight = weight;
-    this.nextDocId = 0;
-    this.maxErrorDocs = (int) (maxPercentErrorDocs * indexReader.numDocs());
-    this.field = field;
-  }
-
-  /**
-   * Given the document name, derive a name for the vector. This may involve
-   * reading the document from Lucene and setting up any other state that the
-   * subclass wants. This will be called once for each document that the
-   * iterator processes.
-   * @param documentIndex the lucene document index.
-   * @return the name to store in the vector.
-   */
-  protected abstract String getVectorName(int documentIndex) throws IOException;
-
-  @Override
-  protected Vector computeNext() {
-    try {
-      int doc;
-      Terms termFreqVector;
-      String name;
-
-      do {
-        doc = this.nextDocId;
-        nextDocId++;
-
-        if (doc >= indexReader.maxDoc()) {
-          return endOfData();
-        }
-
-        termFreqVector = indexReader.getTermVector(doc, field);
-        name = getVectorName(doc);
-
-        if (termFreqVector == null) {
-          numErrorDocs++;
-          if (numErrorDocs >= maxErrorDocs) {
-            log.error("There are too many documents that do not have a term vector for {}", field);
-            throw new IllegalStateException("There are too many documents that do not have a term vector for "
-                + field);
-          }
-          if (numErrorDocs >= nextLogRecord) {
-            if (skippedErrorMessages == 0) {
-              log.warn("{} does not have a term vector for {}", name, field);
-            } else {
-              log.warn("{} documents do not have a term vector for {}", numErrorDocs, field);
-            }
-            nextLogRecord = bump.increment();
-            skippedErrorMessages = 0;
-          } else {
-            skippedErrorMessages++;
-          }
-        }
-      } while (termFreqVector == null);
-
-      // The loop exits with termFreqVector and name set.
-
-      TermsEnum te = termFreqVector.iterator();
-      BytesRef term;
-      TFDFMapper mapper = new TFDFMapper(indexReader.numDocs(), weight, this.terminfo);
-      mapper.setExpectations(field, termFreqVector.size());
-      while ((term = te.next()) != null) {
-        mapper.map(term, (int) te.totalTermFreq());
-      }
-      Vector result = mapper.getVector();
-      if (result == null) {
-        // TODO is this right? last version would produce null in the iteration in this case, though it
-        // seems like that may not be desirable
-        return null;
-      }
-
-      if (normPower == LuceneIterable.NO_NORMALIZING) {
-        result = new NamedVector(result, name);
-      } else {
-        result = new NamedVector(result.normalize(normPower), name);
-      }
-      return result;
-    } catch (IOException ioe) {
-      throw new IllegalStateException(ioe);
-    }
-  }
-}
http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfo.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfo.java b/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfo.java
deleted file mode 100644
index 0b59ed6..0000000
--- a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfo.java
+++ /dev/null
@@ -1,79 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.vectors.lucene;
-
-import java.io.IOException;
-import java.util.Iterator;
-import java.util.LinkedHashMap;
-import java.util.Map;
-
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.MultiFields;
-import org.apache.lucene.index.Terms;
-import org.apache.lucene.index.TermsEnum;
-import org.apache.lucene.util.BytesRef;
-import org.apache.mahout.utils.vectors.TermEntry;
-import org.apache.mahout.utils.vectors.TermInfo;
-
-
-/**
- * Caches TermEntries from a single field. Materializes all values in the TermEnum to memory (much like FieldCache)
- */
-public class CachedTermInfo implements TermInfo {
-
-  private final Map<String, TermEntry> termEntries;
-  private final String field;
-
-  public CachedTermInfo(IndexReader reader, String field, int minDf, int maxDfPercent) throws IOException {
-    this.field = field;
-    Terms t = MultiFields.getTerms(reader, field);
-    TermsEnum te = t.iterator();
-
-    int numDocs = reader.numDocs();
-    double percent = numDocs * maxDfPercent / 100.0;
-    //Should we use a linked hash map so that we know terms are in order?
-    termEntries = new LinkedHashMap<>();
-    int count = 0;
-    BytesRef text;
-    while ((text = te.next()) != null) {
-      int df = te.docFreq();
-      if (df >= minDf && df <= percent) {
-        TermEntry entry = new TermEntry(text.utf8ToString(), count++, df);
-        termEntries.put(entry.getTerm(), entry);
-      }
-    }
-  }
-
-  @Override
-  public int totalTerms(String field) {
-    return termEntries.size();
-  }
-
-  @Override
-  public TermEntry getTermEntry(String field, String term) {
-    if (!this.field.equals(field)) {
-      return null;
-    }
-    return termEntries.get(term);
-  }
-
-  @Override
-  public Iterator<TermEntry> getAllEntries() {
-    return termEntries.values().iterator();
-  }
-}
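CachedTermInfo materializes the term dictionary once, filtering terms by document frequency. A sketch of using it on its own (not from the commit; the index path and field name are placeholders, and the Lucene API level is assumed to match the deleted sources):

import java.nio.file.Paths;
import java.util.Iterator;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.FSDirectory;
import org.apache.mahout.utils.vectors.TermEntry;
import org.apache.mahout.utils.vectors.TermInfo;
import org.apache.mahout.utils.vectors.lucene.CachedTermInfo;

public class DumpDictionary {
  public static void main(String[] args) throws Exception {
    try (IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("/tmp/index")))) {
      // keep terms with df >= 1 that occur in at most 99% of documents
      TermInfo termInfo = new CachedTermInfo(reader, "body", 1, 99);
      Iterator<TermEntry> it = termInfo.getAllEntries();
      while (it.hasNext()) {
        TermEntry e = it.next();
        System.out.println(e.getTerm() + "\t" + e.getTermIdx() + "\t" + e.getDocFreq());
      }
    }
  }
}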
http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java b/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java
deleted file mode 100644
index b2568e7..0000000
--- a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java
+++ /dev/null
@@ -1,381 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.vectors.lucene;
-
-import java.io.File;
-import java.io.IOException;
-import java.io.OutputStreamWriter;
-import java.io.Writer;
-import java.nio.file.Paths;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.HashSet;
-import java.util.LinkedHashMap;
-import java.util.LinkedList;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-import java.util.TreeSet;
-
-import com.google.common.io.Closeables;
-import com.google.common.io.Files;
-import org.apache.commons.cli2.CommandLine;
-import org.apache.commons.cli2.Group;
-import org.apache.commons.cli2.Option;
-import org.apache.commons.cli2.OptionException;
-import org.apache.commons.cli2.builder.ArgumentBuilder;
-import org.apache.commons.cli2.builder.DefaultOptionBuilder;
-import org.apache.commons.cli2.builder.GroupBuilder;
-import org.apache.commons.cli2.commandline.Parser;
-import org.apache.commons.io.Charsets;
-import org.apache.hadoop.fs.Path;
-import org.apache.lucene.index.DirectoryReader;
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.MultiFields;
-import org.apache.lucene.index.PostingsEnum;
-import org.apache.lucene.index.Term;
-import org.apache.lucene.index.Terms;
-import org.apache.lucene.index.TermsEnum;
-import org.apache.lucene.search.DocIdSetIterator;
-import org.apache.lucene.store.Directory;
-import org.apache.lucene.store.FSDirectory;
-import org.apache.lucene.util.Bits;
-import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.FixedBitSet;
-import org.apache.mahout.clustering.classify.WeightedPropertyVectorWritable;
-import org.apache.mahout.common.CommandLineUtil;
-import org.apache.mahout.common.commandline.DefaultOptionCreator;
-import org.apache.mahout.math.NamedVector;
-import org.apache.mahout.math.Vector;
-import org.apache.mahout.math.stats.LogLikelihood;
-import org.apache.mahout.utils.clustering.ClusterDumper;
-import org.apache.mahout.utils.vectors.TermEntry;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- * Get labels for the cluster using Log Likelihood Ratio (LLR).
- * <p/>
- * "The most useful way to think of this (LLR) is as the percentage of in-cluster documents that have the
- * feature (term) versus the percentage out, keeping in mind that both percentages are uncertain since we have
- * only a sample of all possible documents." - Ted Dunning
- * <p/>
- * More about LLR can be found at: http://tdunning.blogspot.com/2008/03/surprise-and-coincidence.html
- */
-public class ClusterLabels {
-
-  private static final Logger log = LoggerFactory.getLogger(ClusterLabels.class);
-
-  public static final int DEFAULT_MIN_IDS = 50;
-  public static final int DEFAULT_MAX_LABELS = 25;
-
-  private final String indexDir;
-  private final String contentField;
-  private String idField;
-  private final Map<Integer, List<WeightedPropertyVectorWritable>> clusterIdToPoints;
-  private String output;
-  private final int minNumIds;
-  private final int maxLabels;
-
-  public ClusterLabels(Path seqFileDir,
-                       Path pointsDir,
-                       String indexDir,
-                       String contentField,
-                       int minNumIds,
-                       int maxLabels) {
-    this.indexDir = indexDir;
-    this.contentField = contentField;
-    this.minNumIds = minNumIds;
-    this.maxLabels = maxLabels;
-    ClusterDumper clusterDumper = new ClusterDumper(seqFileDir, pointsDir);
-    this.clusterIdToPoints = clusterDumper.getClusterIdToPoints();
-  }
-
-  public void getLabels() throws IOException {
-
-    try (Writer writer = (this.output == null) ?
-        new OutputStreamWriter(System.out, Charsets.UTF_8) : Files.newWriter(new File(this.output), Charsets.UTF_8)) {
-      for (Map.Entry<Integer, List<WeightedPropertyVectorWritable>> integerListEntry : clusterIdToPoints.entrySet()) {
-        List<WeightedPropertyVectorWritable> wpvws = integerListEntry.getValue();
-        List<TermInfoClusterInOut> termInfos = getClusterLabels(integerListEntry.getKey(), wpvws);
-        if (termInfos != null) {
-          writer.write('\n');
-          writer.write("Top labels for Cluster ");
-          writer.write(String.valueOf(integerListEntry.getKey()));
-          writer.write(" containing ");
-          writer.write(String.valueOf(wpvws.size()));
-          writer.write(" vectors");
-          writer.write('\n');
-          writer.write("Term \t\t LLR \t\t In-ClusterDF \t\t Out-ClusterDF ");
-          writer.write('\n');
-          for (TermInfoClusterInOut termInfo : termInfos) {
-            writer.write(termInfo.getTerm());
-            writer.write("\t\t");
-            writer.write(String.valueOf(termInfo.getLogLikelihoodRatio()));
-            writer.write("\t\t");
-            writer.write(String.valueOf(termInfo.getInClusterDF()));
-            writer.write("\t\t");
-            writer.write(String.valueOf(termInfo.getOutClusterDF()));
-            writer.write('\n');
-          }
-        }
-      }
-    }
-  }
-
-  /**
-   * Get the list of labels, sorted by best score.
-   */
-  protected List<TermInfoClusterInOut> getClusterLabels(Integer integer,
-      Collection<WeightedPropertyVectorWritable> wpvws) throws IOException {
-
-    if (wpvws.size() < minNumIds) {
-      log.info("Skipping small cluster {} with size: {}", integer, wpvws.size());
-      return null;
-    }
-
-    log.info("Processing Cluster {} with {} documents", integer, wpvws.size());
-    Directory dir = FSDirectory.open(Paths.get(this.indexDir));
-    IndexReader reader = DirectoryReader.open(dir);
-
-    log.info("# of documents in the index {}", reader.numDocs());
-
-    Collection<String> idSet = new HashSet<>();
-    for (WeightedPropertyVectorWritable wpvw : wpvws) {
-      Vector vector = wpvw.getVector();
-      if (vector instanceof NamedVector) {
-        idSet.add(((NamedVector) vector).getName());
-      }
-    }
-
-    int numDocs = reader.numDocs();
-
-    FixedBitSet clusterDocBitset = getClusterDocBitset(reader, idSet, this.idField);
-
-    log.info("Populating term infos from the index");
-
-    /**
-     * This code is similar to that of CachedTermInfo, with one major change, which is to get the document frequency.
-     *
-     * Since we have deleted the documents out of the cluster, the document frequency for a term should only
-     * include the in-cluster documents. The document frequency obtained from TermEnum reflects the frequency
-     * in the entire index. To get the in-cluster frequency, we need to query the index to get the term
-     * frequencies in each document. The number of results of this call will be the in-cluster document
-     * frequency.
-     */
-    Terms t = MultiFields.getTerms(reader, contentField);
-    TermsEnum te = t.iterator();
-    Map<String, TermEntry> termEntryMap = new LinkedHashMap<>();
-    Bits liveDocs = MultiFields.getLiveDocs(reader); //WARNING: returns null if there are no deletions
-
-    int count = 0;
-    BytesRef term;
-    while ((term = te.next()) != null) {
-      FixedBitSet termBitset = new FixedBitSet(reader.maxDoc());
-      PostingsEnum docsEnum = MultiFields.getTermDocsEnum(reader, contentField, term);
-      int docID;
-      while ((docID = docsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
-        //check to see if we don't have any deletions (null) or if the document is live
-        if (liveDocs != null && !liveDocs.get(docID)) {
-          // document is deleted...
-          termBitset.set(docsEnum.docID());
-        }
-      }
-      // AND the term's bitset with cluster doc bitset to get the term's in-cluster frequency.
-      // This modifies the termBitset, but that's fine as we are not using it anywhere else.
-      termBitset.and(clusterDocBitset);
-      int inclusterDF = (int) termBitset.cardinality();
-
-      TermEntry entry = new TermEntry(term.utf8ToString(), count++, inclusterDF);
-      termEntryMap.put(entry.getTerm(), entry);
-
-    }
-
-    List<TermInfoClusterInOut> clusteredTermInfo = new LinkedList<>();
-
-    int clusterSize = wpvws.size();
-
-    for (TermEntry termEntry : termEntryMap.values()) {
-
-      int corpusDF = reader.docFreq(new Term(this.contentField, termEntry.getTerm()));
-      int outDF = corpusDF - termEntry.getDocFreq();
-      int inDF = termEntry.getDocFreq();
-      double logLikelihoodRatio = scoreDocumentFrequencies(inDF, outDF, clusterSize, numDocs);
-      TermInfoClusterInOut termInfoCluster =
-          new TermInfoClusterInOut(termEntry.getTerm(), inDF, outDF, logLikelihoodRatio);
-      clusteredTermInfo.add(termInfoCluster);
-    }
-
-    Collections.sort(clusteredTermInfo);
-    // Cleanup
-    Closeables.close(reader, true);
-    termEntryMap.clear();
-
-    return clusteredTermInfo.subList(0, Math.min(clusteredTermInfo.size(), maxLabels));
-  }
-
-  private static FixedBitSet getClusterDocBitset(IndexReader reader,
-                                                 Collection<String> idSet,
-                                                 String idField) throws IOException {
-    int numDocs = reader.numDocs();
-
-    FixedBitSet bitset = new FixedBitSet(numDocs);
-
-    Set<String> idFieldSelector = null;
-    if (idField != null) {
-      idFieldSelector = new TreeSet<>();
-      idFieldSelector.add(idField);
-    }
-
-    for (int i = 0; i < numDocs; i++) {
-      String id;
-      // Use Lucene's internal ID if idField is not specified. Else, get it from the document.
-      if (idField == null) {
-        id = Integer.toString(i);
-      } else {
-        id = reader.document(i, idFieldSelector).get(idField);
-      }
-      if (idSet.contains(id)) {
-        bitset.set(i);
-      }
-    }
-    log.info("Created bitset for in-cluster documents : {}", bitset.cardinality());
-    return bitset;
-  }
-
-  private static double scoreDocumentFrequencies(long inDF, long outDF, long clusterSize, long corpusSize) {
-    long k12 = clusterSize - inDF;
-    long k22 = corpusSize - clusterSize - outDF;
-
-    return LogLikelihood.logLikelihoodRatio(inDF, k12, outDF, k22);
-  }
-
-  public String getIdField() {
-    return idField;
-  }
-
-  public void setIdField(String idField) {
-    this.idField = idField;
-  }
-
-  public String getOutput() {
-    return output;
-  }
-
-  public void setOutput(String output) {
-    this.output = output;
-  }
-
-  public static void main(String[] args) {
-
-    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
-    ArgumentBuilder abuilder = new ArgumentBuilder();
-    GroupBuilder gbuilder = new GroupBuilder();
-
-    Option indexOpt = obuilder.withLongName("dir").withRequired(true).withArgument(
-        abuilder.withName("dir").withMinimum(1).withMaximum(1).create())
-        .withDescription("The Lucene index directory").withShortName("d").create();
-
-    Option outputOpt = obuilder.withLongName("output").withRequired(false).withArgument(
-        abuilder.withName("output").withMinimum(1).withMaximum(1).create()).withDescription(
-        "The output file. If not specified, the result is printed on console.").withShortName("o").create();
-
-    Option fieldOpt = obuilder.withLongName("field").withRequired(true).withArgument(
-        abuilder.withName("field").withMinimum(1).withMaximum(1).create())
-        .withDescription("The content field in the index").withShortName("f").create();
-
-    Option idFieldOpt = obuilder.withLongName("idField").withRequired(false).withArgument(
-        abuilder.withName("idField").withMinimum(1).withMaximum(1).create()).withDescription(
-        "The field for the document ID in the index. If null, then the Lucene internal doc "
-            + "id is used which is prone to error if the underlying index changes").withShortName("i").create();
-
-    Option seqOpt = obuilder.withLongName("seqFileDir").withRequired(true).withArgument(
-        abuilder.withName("seqFileDir").withMinimum(1).withMaximum(1).create()).withDescription(
-        "The directory containing Sequence Files for the Clusters").withShortName("s").create();
-
-    Option pointsOpt = obuilder.withLongName("pointsDir").withRequired(true).withArgument(
-        abuilder.withName("pointsDir").withMinimum(1).withMaximum(1).create()).withDescription(
-        "The directory containing points sequence files mapping input vectors to their cluster.")
-        .withShortName("p").create();
-    Option minClusterSizeOpt = obuilder.withLongName("minClusterSize").withRequired(false).withArgument(
-        abuilder.withName("minClusterSize").withMinimum(1).withMaximum(1).create()).withDescription(
-        "The minimum number of points required in a cluster to print the labels for").withShortName("m").create();
-    Option maxLabelsOpt = obuilder.withLongName("maxLabels").withRequired(false).withArgument(
-        abuilder.withName("maxLabels").withMinimum(1).withMaximum(1).create()).withDescription(
-        "The maximum number of labels to print per cluster").withShortName("x").create();
-    Option helpOpt = DefaultOptionCreator.helpOption();
-
-    Group group = gbuilder.withName("Options").withOption(indexOpt).withOption(idFieldOpt).withOption(outputOpt)
-        .withOption(fieldOpt).withOption(seqOpt).withOption(pointsOpt).withOption(helpOpt)
-        .withOption(maxLabelsOpt).withOption(minClusterSizeOpt).create();
-
-    try {
-      Parser parser = new Parser();
-      parser.setGroup(group);
-      CommandLine cmdLine = parser.parse(args);
-
-      if (cmdLine.hasOption(helpOpt)) {
-        CommandLineUtil.printHelp(group);
-        return;
-      }
-
-      Path seqFileDir = new Path(cmdLine.getValue(seqOpt).toString());
-      Path pointsDir = new Path(cmdLine.getValue(pointsOpt).toString());
-      String indexDir = cmdLine.getValue(indexOpt).toString();
-      String contentField = cmdLine.getValue(fieldOpt).toString();
-
-      String idField = null;
-
-      if (cmdLine.hasOption(idFieldOpt)) {
-        idField = cmdLine.getValue(idFieldOpt).toString();
-      }
-      String output = null;
-      if (cmdLine.hasOption(outputOpt)) {
-        output = cmdLine.getValue(outputOpt).toString();
-      }
-      int maxLabels = DEFAULT_MAX_LABELS;
-      if (cmdLine.hasOption(maxLabelsOpt)) {
-        maxLabels = Integer.parseInt(cmdLine.getValue(maxLabelsOpt).toString());
-      }
-      int minSize = DEFAULT_MIN_IDS;
-      if (cmdLine.hasOption(minClusterSizeOpt)) {
-        minSize = Integer.parseInt(cmdLine.getValue(minClusterSizeOpt).toString());
-      }
-      ClusterLabels clusterLabel = new ClusterLabels(seqFileDir, pointsDir, indexDir, contentField, minSize, maxLabels);
-
-      if (idField != null) {
-        clusterLabel.setIdField(idField);
-      }
-      if (output != null) {
-        clusterLabel.setOutput(output);
-      }
-
-      clusterLabel.getLabels();
-
-    } catch (OptionException e) {
-      log.error("Exception", e);
-      CommandLineUtil.printHelp(group);
-    } catch (IOException e) {
-      log.error("Exception", e);
-    }
-  }
-
-}
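The scoreDocumentFrequencies method above builds the 2x2 contingency table that LogLikelihood.logLikelihoodRatio expects: k11 = in-cluster documents containing the term, k12 = in-cluster documents without it, k21 = out-of-cluster documents containing it, k22 = everything else. A worked sketch with invented counts (not part of the commit):

import org.apache.mahout.math.stats.LogLikelihood;

public class LlrSketch {
  public static void main(String[] args) {
    // a 200-doc cluster in a 10,000-doc corpus; the candidate term occurs
    // in 150 in-cluster and 300 out-of-cluster documents
    long inDF = 150, outDF = 300, clusterSize = 200, corpusSize = 10000;
    long k11 = inDF;                              // in cluster, has term
    long k12 = clusterSize - inDF;                // in cluster, no term
    long k21 = outDF;                             // out of cluster, has term
    long k22 = corpusSize - clusterSize - outDF;  // out of cluster, no term
    System.out.println(LogLikelihood.logLikelihoodRatio(k11, k12, k21, k22));
  }
}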
http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java b/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java
deleted file mode 100644
index 876816f..0000000
--- a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java
+++ /dev/null
@@ -1,349 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- * <p/>
- * http://www.apache.org/licenses/LICENSE-2.0
- * <p/>
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.vectors.lucene;
-
-import java.io.File;
-import java.io.IOException;
-import java.io.Writer;
-import java.nio.file.Paths;
-import java.util.Iterator;
-
-import com.google.common.base.Preconditions;
-import com.google.common.io.Files;
-import org.apache.commons.cli2.CommandLine;
-import org.apache.commons.cli2.Group;
-import org.apache.commons.cli2.Option;
-import org.apache.commons.cli2.OptionException;
-import org.apache.commons.cli2.builder.ArgumentBuilder;
-import org.apache.commons.cli2.builder.DefaultOptionBuilder;
-import org.apache.commons.cli2.builder.GroupBuilder;
-import org.apache.commons.cli2.commandline.Parser;
-import org.apache.commons.io.Charsets;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.IntWritable;
-import org.apache.hadoop.io.LongWritable;
-import org.apache.hadoop.io.SequenceFile;
-import org.apache.hadoop.io.Text;
-import org.apache.lucene.index.DirectoryReader;
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.store.Directory;
-import org.apache.lucene.store.FSDirectory;
-import org.apache.mahout.common.CommandLineUtil;
-import org.apache.mahout.math.VectorWritable;
-import org.apache.mahout.utils.vectors.TermEntry;
-import org.apache.mahout.utils.vectors.TermInfo;
-import org.apache.mahout.utils.vectors.io.DelimitedTermInfoWriter;
-import org.apache.mahout.utils.vectors.io.SequenceFileVectorWriter;
-import org.apache.mahout.utils.vectors.io.VectorWriter;
-import org.apache.mahout.vectorizer.TF;
-import org.apache.mahout.vectorizer.TFIDF;
-import org.apache.mahout.vectorizer.Weight;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-public final class Driver {
-
-  private static final Logger log = LoggerFactory.getLogger(Driver.class);
-
-  private String luceneDir;
-  private String outFile;
-  private String field;
-  private String idField;
-  private String dictOut;
-  private String seqDictOut = "";
-  private String weightType = "tfidf";
-  private String delimiter = "\t";
-  private double norm = LuceneIterable.NO_NORMALIZING;
-  private long maxDocs = Long.MAX_VALUE;
-  private int minDf = 1;
-  private int maxDFPercent = 99;
-  private double maxPercentErrorDocs = 0.0;
-
-  public void dumpVectors() throws IOException {
-
-    File file = new File(luceneDir);
-    Preconditions.checkArgument(file.isDirectory(),
-        "Lucene directory: " + file.getAbsolutePath()
-            + " does not exist or is not a directory");
-    Preconditions.checkArgument(maxDocs >= 0, "maxDocs must be >= 0");
-    Preconditions.checkArgument(minDf >= 1, "minDf must be >= 1");
-    Preconditions.checkArgument(maxDFPercent <= 99, "maxDFPercent must be <= 99");
-
-    Directory dir = FSDirectory.open(Paths.get(file.getAbsolutePath()));
-    IndexReader reader = DirectoryReader.open(dir);
-
-    Weight weight;
-    if ("tf".equalsIgnoreCase(weightType)) {
-      weight = new TF();
-    } else if ("tfidf".equalsIgnoreCase(weightType)) {
-      weight = new TFIDF();
-    } else {
-      throw new IllegalArgumentException("Weight type " + weightType + " is not supported");
-    }
-
-    TermInfo termInfo = new CachedTermInfo(reader, field, minDf, maxDFPercent);
-
-    LuceneIterable iterable;
-    if (norm == LuceneIterable.NO_NORMALIZING) {
-      iterable = new LuceneIterable(reader, idField, field, termInfo, weight, LuceneIterable.NO_NORMALIZING,
-          maxPercentErrorDocs);
-    } else {
-      iterable = new LuceneIterable(reader, idField, field, termInfo, weight, norm, maxPercentErrorDocs);
-    }
-
-    log.info("Output File: {}", outFile);
-
-    try (VectorWriter vectorWriter = getSeqFileWriter(outFile)) {
-      long numDocs = vectorWriter.write(iterable, maxDocs);
-      log.info("Wrote: {} vectors", numDocs);
-    }
-
-    File dictOutFile = new File(dictOut);
-    log.info("Dictionary Output file: {}", dictOutFile);
-    Writer writer = Files.newWriter(dictOutFile, Charsets.UTF_8);
-    try (DelimitedTermInfoWriter tiWriter = new DelimitedTermInfoWriter(writer, delimiter, field)) {
-      tiWriter.write(termInfo);
-    }
-
-    if (!"".equals(seqDictOut)) {
-      log.info("SequenceFile Dictionary Output file: {}", seqDictOut);
-
-      Path path = new Path(seqDictOut);
-      Configuration conf = new Configuration();
-      FileSystem fs = FileSystem.get(conf);
-      try (SequenceFile.Writer seqWriter = SequenceFile.createWriter(fs, conf, path, Text.class, IntWritable.class)) {
-        Text term = new Text();
-        IntWritable termIndex = new IntWritable();
-        Iterator<TermEntry> termEntries = termInfo.getAllEntries();
-        while (termEntries.hasNext()) {
-          TermEntry termEntry = termEntries.next();
-          term.set(termEntry.getTerm());
-          termIndex.set(termEntry.getTermIdx());
-          seqWriter.append(term, termIndex);
-        }
-      }
-    }
-  }
-
-  public static void main(String[] args) throws IOException {
-
-    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
-    ArgumentBuilder abuilder = new ArgumentBuilder();
-    GroupBuilder gbuilder = new GroupBuilder();
-
-    Option inputOpt = obuilder.withLongName("dir").withRequired(true).withArgument(
-        abuilder.withName("dir").withMinimum(1).withMaximum(1).create())
-        .withDescription("The Lucene directory").withShortName("d").create();
-
-    Option outputOpt = obuilder.withLongName("output").withRequired(true).withArgument(
-        abuilder.withName("output").withMinimum(1).withMaximum(1).create()).withDescription("The output file")
-        .withShortName("o").create();
-
-    Option fieldOpt = obuilder.withLongName("field").withRequired(true).withArgument(
-        abuilder.withName("field").withMinimum(1).withMaximum(1).create()).withDescription(
-        "The field in the index").withShortName("f").create();
-
-    Option idFieldOpt = obuilder.withLongName("idField").withRequired(false).withArgument(
-        abuilder.withName("idField").withMinimum(1).withMaximum(1).create()).withDescription(
-        "The field in the index containing the id. If null, then the Lucene internal doc "
-            + "id is used which is prone to error if the underlying index changes").create();
-
-    Option dictOutOpt = obuilder.withLongName("dictOut").withRequired(true).withArgument(
-        abuilder.withName("dictOut").withMinimum(1).withMaximum(1).create()).withDescription(
-        "The output of the dictionary").withShortName("t").create();
-
-    Option seqDictOutOpt = obuilder.withLongName("seqDictOut").withRequired(false).withArgument(
-        abuilder.withName("seqDictOut").withMinimum(1).withMaximum(1).create()).withDescription(
-        "The output of the dictionary as sequence file").withShortName("st").create();
-
-    Option weightOpt = obuilder.withLongName("weight").withRequired(false).withArgument(
-        abuilder.withName("weight").withMinimum(1).withMaximum(1).create()).withDescription(
-        "The kind of weight to use. Currently TF or TFIDF").withShortName("w").create();
-
-    Option delimiterOpt = obuilder.withLongName("delimiter").withRequired(false).withArgument(
-        abuilder.withName("delimiter").withMinimum(1).withMaximum(1).create()).withDescription(
-        "The delimiter for outputting the dictionary").withShortName("l").create();
-
-    Option powerOpt = obuilder.withLongName("norm").withRequired(false).withArgument(
-        abuilder.withName("norm").withMinimum(1).withMaximum(1).create()).withDescription(
-        "The norm to use, expressed as either a double or \"INF\" if you want to use the Infinite norm. "
-            + "Must be greater or equal to 0. The default is not to normalize").withShortName("n").create();
-
-    Option maxOpt = obuilder.withLongName("max").withRequired(false).withArgument(
-        abuilder.withName("max").withMinimum(1).withMaximum(1).create()).withDescription(
-        "The maximum number of vectors to output. If not specified, then it will loop over all docs")
-        .withShortName("m").create();
-
-    Option minDFOpt = obuilder.withLongName("minDF").withRequired(false).withArgument(
-        abuilder.withName("minDF").withMinimum(1).withMaximum(1).create()).withDescription(
-        "The minimum document frequency. Default is 1").withShortName("md").create();
-
-    Option maxDFPercentOpt = obuilder.withLongName("maxDFPercent").withRequired(false).withArgument(
-        abuilder.withName("maxDFPercent").withMinimum(1).withMaximum(1).create()).withDescription(
-        "The max percentage of docs for the DF. Can be used to remove really high frequency terms."
-            + " Expressed as an integer between 0 and 100. Default is 99.").withShortName("x").create();
-
-    Option maxPercentErrorDocsOpt = obuilder.withLongName("maxPercentErrorDocs").withRequired(false).withArgument(
-        abuilder.withName("maxPercentErrorDocs").withMinimum(1).withMaximum(1).create()).withDescription(
-        "The max percentage of docs that can have a null term vector. These are noise documents and can occur if the "
-            + "analyzer used strips out all terms in the target field. This percentage is expressed as a value "
-            + "between 0 and 1. The default is 0.").withShortName("err").create();
-
-    Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h")
-        .create();
-
-    Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(idFieldOpt).withOption(
-        outputOpt).withOption(delimiterOpt).withOption(helpOpt).withOption(fieldOpt).withOption(maxOpt)
-        .withOption(dictOutOpt).withOption(seqDictOutOpt).withOption(powerOpt).withOption(maxDFPercentOpt)
-        .withOption(weightOpt).withOption(minDFOpt).withOption(maxPercentErrorDocsOpt).create();
-
-    try {
-      Parser parser = new Parser();
-      parser.setGroup(group);
-      CommandLine cmdLine = parser.parse(args);
-
-      if (cmdLine.hasOption(helpOpt)) {
-
-        CommandLineUtil.printHelp(group);
-        return;
-      }
-
-      if (cmdLine.hasOption(inputOpt)) { // Lucene case
-        Driver luceneDriver = new Driver();
-        luceneDriver.setLuceneDir(cmdLine.getValue(inputOpt).toString());
-
-        if (cmdLine.hasOption(maxOpt)) {
-          luceneDriver.setMaxDocs(Long.parseLong(cmdLine.getValue(maxOpt).toString()));
-        }
-
-        if (cmdLine.hasOption(weightOpt)) {
-          luceneDriver.setWeightType(cmdLine.getValue(weightOpt).toString());
-        }
-
-        luceneDriver.setField(cmdLine.getValue(fieldOpt).toString());
-
-        if (cmdLine.hasOption(minDFOpt)) {
-          luceneDriver.setMinDf(Integer.parseInt(cmdLine.getValue(minDFOpt).toString()));
-        }
-
-        if (cmdLine.hasOption(maxDFPercentOpt)) {
-          luceneDriver.setMaxDFPercent(Integer.parseInt(cmdLine.getValue(maxDFPercentOpt).toString()));
-        }
-
-        if (cmdLine.hasOption(powerOpt)) {
-          String power = cmdLine.getValue(powerOpt).toString();
-          if ("INF".equals(power)) {
-            luceneDriver.setNorm(Double.POSITIVE_INFINITY);
-          } else {
-            luceneDriver.setNorm(Double.parseDouble(power));
-          }
-        }
-
-        if (cmdLine.hasOption(idFieldOpt)) {
-          luceneDriver.setIdField(cmdLine.getValue(idFieldOpt).toString());
-        }
-
-        if (cmdLine.hasOption(maxPercentErrorDocsOpt)) {
-          luceneDriver.setMaxPercentErrorDocs(Double.parseDouble(cmdLine.getValue(maxPercentErrorDocsOpt).toString()));
-        }
-
-        luceneDriver.setOutFile(cmdLine.getValue(outputOpt).toString());
-
-        luceneDriver.setDelimiter(cmdLine.hasOption(delimiterOpt) ?
-            cmdLine.getValue(delimiterOpt).toString() : "\t");
-
-        luceneDriver.setDictOut(cmdLine.getValue(dictOutOpt).toString());
-
-        if (cmdLine.hasOption(seqDictOutOpt)) {
-          luceneDriver.setSeqDictOut(cmdLine.getValue(seqDictOutOpt).toString());
-        }
-
-        luceneDriver.dumpVectors();
-      }
-    } catch (OptionException e) {
-      log.error("Exception", e);
-      CommandLineUtil.printHelp(group);
-    }
-  }
-
-  private static VectorWriter getSeqFileWriter(String outFile) throws IOException {
-    Path path = new Path(outFile);
-    Configuration conf = new Configuration();
-    FileSystem fs = FileSystem.get(conf);
-    // TODO: Make this parameter driven
-
-    SequenceFile.Writer seqWriter = SequenceFile.createWriter(fs, conf, path, LongWritable.class,
-        VectorWritable.class);
-
-    return new SequenceFileVectorWriter(seqWriter);
-  }
-
-  public void setLuceneDir(String luceneDir) {
-    this.luceneDir = luceneDir;
-  }
-
-  public void setMaxDocs(long maxDocs) {
-    this.maxDocs = maxDocs;
-  }
-
-  public void setWeightType(String weightType) {
-    this.weightType = weightType;
-  }
-
-  public void setField(String field) {
-    this.field = field;
-  }
-
-  public void setMinDf(int minDf) {
-    this.minDf = minDf;
-  }
-
-  public void setMaxDFPercent(int maxDFPercent) {
-    this.maxDFPercent = maxDFPercent;
-  }
-
-  public void setNorm(double norm) {
-    this.norm = norm;
-  }
-
-  public void setIdField(String idField) {
-    this.idField = idField;
-  }
-
-  public void setOutFile(String outFile) {
-    this.outFile = outFile;
-  }
-
-  public void setDelimiter(String delimiter) {
-    this.delimiter = delimiter;
-  }
-
-  public void setDictOut(String dictOut) {
-    this.dictOut = dictOut;
-  }
-
-  public void setSeqDictOut(String seqDictOut) {
-    this.seqDictOut = seqDictOut;
-  }
-
-  public void setMaxPercentErrorDocs(double maxPercentErrorDocs) {
-    this.maxPercentErrorDocs = maxPercentErrorDocs;
-  }
-}
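A minimal programmatic sketch of driving the class above (not from the commit; paths, field names, and the norm value are placeholders, while all setters appear in the deleted source):

import org.apache.mahout.utils.vectors.lucene.Driver;

public class DumpLuceneVectors {
  public static void main(String[] args) throws Exception {
    Driver driver = new Driver();
    driver.setLuceneDir("/path/to/lucene/index");  // must store term vectors for the field
    driver.setField("body");
    driver.setIdField("id");
    driver.setOutFile("/tmp/vectors.seq");         // SequenceFile<LongWritable, VectorWritable>
    driver.setDictOut("/tmp/dict.txt");
    driver.setWeightType("tfidf");                 // or "tf"
    driver.setNorm(2.0);                           // 2-norm; the default is no normalization
    driver.dumpVectors();
  }
}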
http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java b/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java
deleted file mode 100644
index 1af0ed0..0000000
--- a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java
+++ /dev/null
@@ -1,80 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.vectors.lucene;
-
-import org.apache.lucene.index.IndexReader;
-import org.apache.mahout.math.Vector;
-import org.apache.mahout.utils.vectors.TermInfo;
-import org.apache.mahout.vectorizer.Weight;
-
-import java.util.Iterator;
-
-/**
- * {@link Iterable} counterpart to {@link LuceneIterator}.
- */
-public final class LuceneIterable implements Iterable<Vector> {
-
-  public static final double NO_NORMALIZING = -1.0;
-
-  private final IndexReader indexReader;
-  private final String field;
-  private final String idField;
-  private final TermInfo terminfo;
-  private final double normPower;
-  private final double maxPercentErrorDocs;
-  private final Weight weight;
-
-  public LuceneIterable(IndexReader reader, String idField, String field, TermInfo terminfo, Weight weight) {
-    this(reader, idField, field, terminfo, weight, NO_NORMALIZING);
-  }
-
-  public LuceneIterable(IndexReader indexReader, String idField, String field, TermInfo terminfo, Weight weight,
-      double normPower) {
-    this(indexReader, idField, field, terminfo, weight, normPower, 0);
-  }
-
-  /**
-   * Produce a LuceneIterable that can create the Vector plus normalize it.
-   *
-   * @param indexReader {@link org.apache.lucene.index.IndexReader} to read the documents from.
-   * @param idField field containing the id. May be null.
-   * @param field field to use for the Vector
-   * @param normPower the normalization value. Must be nonnegative, or {@link #NO_NORMALIZING}
-   * @param maxPercentErrorDocs the percentage of documents in the lucene index that can have a null term vector
-   */
-  public LuceneIterable(IndexReader indexReader,
-                        String idField,
-                        String field,
-                        TermInfo terminfo,
-                        Weight weight,
-                        double normPower,
-                        double maxPercentErrorDocs) {
-    this.indexReader = indexReader;
-    this.idField = idField;
-    this.field = field;
-    this.terminfo = terminfo;
-    this.normPower = normPower;
-    this.maxPercentErrorDocs = maxPercentErrorDocs;
-    this.weight = weight;
-  }
-
-  @Override
-  public Iterator<Vector> iterator() {
-    return new LuceneIterator(indexReader, idField, field, terminfo, weight, normPower, maxPercentErrorDocs);
-  }
-}
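A sketch of consuming LuceneIterable directly (not from the commit; the index path and field names are placeholders, and the "body" field is assumed to have been indexed with term vectors, which the iterator requires):

import java.nio.file.Paths;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.FSDirectory;
import org.apache.mahout.math.Vector;
import org.apache.mahout.utils.vectors.TermInfo;
import org.apache.mahout.utils.vectors.lucene.CachedTermInfo;
import org.apache.mahout.utils.vectors.lucene.LuceneIterable;
import org.apache.mahout.vectorizer.TFIDF;

public class IterateVectors {
  public static void main(String[] args) throws Exception {
    try (IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("/tmp/index")))) {
      TermInfo termInfo = new CachedTermInfo(reader, "body", 1, 99);
      // 2-norm normalization; pass LuceneIterable.NO_NORMALIZING to skip it
      LuceneIterable vectors = new LuceneIterable(reader, "id", "body", termInfo, new TFIDF(), 2.0);
      for (Vector v : vectors) {
        System.out.println(v.asFormatString());  // each v is a NamedVector
      }
    }
  }
}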
http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterator.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterator.java b/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterator.java
deleted file mode 100644
index 6a8c659..0000000
--- a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterator.java
+++ /dev/null
@@ -1,99 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.vectors.lucene;
-
-import java.io.IOException;
-import java.util.Set;
-import java.util.TreeSet;
-
-import com.google.common.base.Preconditions;
-import org.apache.lucene.index.IndexReader;
-import org.apache.mahout.utils.vectors.TermInfo;
-import org.apache.mahout.vectorizer.Weight;
-
-/**
- * An {@link java.util.Iterator} over {@link org.apache.mahout.math.Vector}s that uses a Lucene index as the source
- * for creating the {@link org.apache.mahout.math.Vector}s. The field used to create the vectors currently must have
- * term vectors stored for it.
- */
-public class LuceneIterator extends AbstractLuceneIterator {
-
-  protected final Set<String> idFieldSelector;
-  protected final String idField;
-
-  /**
-   * Produce a LuceneIterable that can create the Vector plus normalize it.
-   *
-   * @param indexReader {@link IndexReader} to read the documents from.
-   * @param idField field containing the id. May be null.
-   * @param field field to use for the Vector
-   * @param termInfo termInfo
-   * @param weight weight
-   * @param normPower the normalization value. Must be non-negative, or {@link LuceneIterable#NO_NORMALIZING}
-   */
-  public LuceneIterator(IndexReader indexReader, String idField, String field, TermInfo termInfo, Weight weight,
-      double normPower) {
-    this(indexReader, idField, field, termInfo, weight, normPower, 0.0);
-  }
-
-  /**
-   * @param indexReader {@link IndexReader} to read the documents from.
-   * @param idField field containing the id. May be null.
-   * @param field field to use for the Vector
-   * @param termInfo termInfo
-   * @param weight weight
-   * @param normPower the normalization value. Must be non-negative, or {@link LuceneIterable#NO_NORMALIZING}
-   * @param maxPercentErrorDocs the maximum fraction of documents tolerated without a term frequency vector, in [0,1].
-   * @see #LuceneIterator(org.apache.lucene.index.IndexReader, String, String, org.apache.mahout.utils.vectors.TermInfo,
-   *   org.apache.mahout.vectorizer.Weight, double)
-   */
-  public LuceneIterator(IndexReader indexReader,
-                        String idField,
-                        String field,
-                        TermInfo termInfo,
-                        Weight weight,
-                        double normPower,
-                        double maxPercentErrorDocs) {
-    super(termInfo, normPower, indexReader, weight, maxPercentErrorDocs, field);
-    // term docs(null) is a better way of iterating all the docs in Lucene
-    Preconditions.checkArgument(normPower == LuceneIterable.NO_NORMALIZING || normPower >= 0,
-        "normPower must be non-negative or -1, but normPower = " + normPower);
-    Preconditions.checkArgument(maxPercentErrorDocs >= 0.0 && maxPercentErrorDocs <= 1.0,
-        "Must be: 0.0 <= maxPercentErrorDocs <= 1.0");
-    this.idField = idField;
-    if (idField != null) {
-      idFieldSelector = new TreeSet<>();
-      idFieldSelector.add(idField);
-    } else {
-      /* The field in the index containing the id. If null, then the Lucene internal doc id is used,
-         which is prone to error if the underlying index changes */
-      idFieldSelector = null;
-    }
-  }
-
-  @Override
-  protected String getVectorName(int documentIndex) throws IOException {
-    String name;
-    if (idField != null) {
-      name = indexReader.document(documentIndex, idFieldSelector).get(idField);
-    } else {
-      name = String.valueOf(documentIndex);
-    }
-    return name;
-  }
-}
http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/TFDFMapper.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/TFDFMapper.java b/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/TFDFMapper.java
deleted file mode 100644
index 5830ccc..0000000
--- a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/TFDFMapper.java
+++ /dev/null
@@ -1,64 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.vectors.lucene;
-
-import org.apache.lucene.util.BytesRef;
-import org.apache.mahout.math.RandomAccessSparseVector;
-import org.apache.mahout.math.Vector;
-import org.apache.mahout.utils.vectors.TermEntry;
-import org.apache.mahout.utils.vectors.TermInfo;
-import org.apache.mahout.vectorizer.Weight;
-
-
-/**
- * Not thread-safe
- */
-public class TFDFMapper {
-
-  private Vector vector;
-
-  private final Weight weight;
-  private long numTerms;
-  private final TermInfo termInfo;
-  private String field;
-  private final int numDocs;
-
-  public TFDFMapper(int numDocs, Weight weight, TermInfo termInfo) {
-    this.weight = weight;
-    this.termInfo = termInfo;
-    this.numDocs = numDocs;
-  }
-
-  public void setExpectations(String field, long numTerms) {
-    this.field = field;
-    vector = new RandomAccessSparseVector(termInfo.totalTerms(field));
-    this.numTerms = numTerms;
-  }
-
-  public void map(BytesRef term, int frequency) {
-    TermEntry entry = termInfo.getTermEntry(field, term.utf8ToString());
-    if (entry != null) {
-      vector.setQuick(entry.getTermIdx(), weight.calculate(frequency, entry.getDocFreq(), (int) numTerms, numDocs));
-    }
-  }
-
-  public Vector getVector() {
-    return this.vector;
-  }
-
-}
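Per term, TFDFMapper looks up the dictionary entry and weights the raw in-document frequency. The weighting call in isolation (not from the commit; all counts are invented, and the four-argument signature is the one TFDFMapper.map() uses above):

import org.apache.mahout.vectorizer.TFIDF;
import org.apache.mahout.vectorizer.Weight;

public class WeightSketch {
  public static void main(String[] args) {
    Weight weight = new TFIDF();
    int tf = 3;            // raw frequency of the term in this document
    int df = 10;           // corpus document frequency from the TermInfo dictionary
    int numTerms = 500;    // size of this document's term vector (the "length" argument)
    int numDocs = 100000;  // total documents, i.e. reader.numDocs()
    // the same call TFDFMapper.map() makes before vector.setQuick(...)
    System.out.println(weight.calculate(tf, df, numTerms, numDocs));
  }
}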
http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/TermInfoClusterInOut.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/TermInfoClusterInOut.java b/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/TermInfoClusterInOut.java
deleted file mode 100644
index b0311c7..0000000
--- a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/TermInfoClusterInOut.java
+++ /dev/null
@@ -1,81 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.vectors.lucene;
-
-import org.apache.mahout.common.RandomUtils;
-
-class TermInfoClusterInOut implements Comparable<TermInfoClusterInOut> {
-
-  private final String term;
-  private final int inClusterDF;
-  private final int outClusterDF;
-  private final double logLikelihoodRatio;
-
-  TermInfoClusterInOut(String term, int inClusterDF, int outClusterDF, double logLikelihoodRatio) {
-    this.term = term;
-    this.inClusterDF = inClusterDF;
-    this.outClusterDF = outClusterDF;
-    this.logLikelihoodRatio = logLikelihoodRatio;
-  }
-
-  @Override
-  public int hashCode() {
-    return term.hashCode() ^ inClusterDF ^ outClusterDF ^ RandomUtils.hashDouble(logLikelihoodRatio);
-  }
-
-  @Override
-  public boolean equals(Object o) {
-    if (!(o instanceof TermInfoClusterInOut)) {
-      return false;
-    }
-    TermInfoClusterInOut other = (TermInfoClusterInOut) o;
-    return term.equals(other.getTerm())
-        && inClusterDF == other.getInClusterDF()
-        && outClusterDF == other.getOutClusterDF()
-        && logLikelihoodRatio == other.getLogLikelihoodRatio();
-  }
-
-  @Override
-  public int compareTo(TermInfoClusterInOut that) {
-    int res = Double.compare(that.logLikelihoodRatio, logLikelihoodRatio);
-    if (res == 0) {
-      res = term.compareTo(that.term);
-    }
-    return res;
-  }
-
-  public int getInClusterDiff() {
-    return this.inClusterDF - this.outClusterDF;
-  }
-
-  String getTerm() {
-    return term;
-  }
-
-  int getInClusterDF() {
-    return inClusterDF;
-  }
-
-  int getOutClusterDF() {
-    return outClusterDF;
-  }
-
-  double getLogLikelihoodRatio() {
-    return logLikelihoodRatio;
-  }
-}
http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/test/java/org/apache/mahout/cf/taste/impl/similarity/jdbc/MySQLJDBCInMemoryItemSimilarityTest.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/cf/taste/impl/similarity/jdbc/MySQLJDBCInMemoryItemSimilarityTest.java b/integration/src/test/java/org/apache/mahout/cf/taste/impl/similarity/jdbc/MySQLJDBCInMemoryItemSimilarityTest.java
deleted file mode 100644
index 463a45f..0000000
--- a/integration/src/test/java/org/apache/mahout/cf/taste/impl/similarity/jdbc/MySQLJDBCInMemoryItemSimilarityTest.java
+++ /dev/null
@@ -1,79 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.impl.similarity.jdbc;
-
-import org.apache.mahout.cf.taste.impl.TasteTestCase;
-import org.apache.mahout.cf.taste.similarity.ItemSimilarity;
-import org.easymock.EasyMock;
-import org.junit.Test;
-
-import javax.sql.DataSource;
-import java.sql.Connection;
-import java.sql.PreparedStatement;
-import java.sql.ResultSet;
-
-public class MySQLJDBCInMemoryItemSimilarityTest extends TasteTestCase {
-
-  @Test
-  public void testMemoryLoad() throws Exception {
-
-    DataSource dataSource = EasyMock.createMock(DataSource.class);
-    Connection connection = EasyMock.createMock(Connection.class);
-    PreparedStatement statement = EasyMock.createMock(PreparedStatement.class);
-    ResultSet resultSet = EasyMock.createMock(ResultSet.class);
-
-    EasyMock.expect(dataSource.getConnection()).andReturn(connection);
-    EasyMock.expect(connection.prepareStatement(MySQLJDBCInMemoryItemSimilarity.DEFAULT_GET_ALL_ITEMSIMILARITIES_SQL,
-        ResultSet.TYPE_FORWARD_ONLY, ResultSet.CONCUR_READ_ONLY)).andReturn(statement);
-    statement.setFetchDirection(ResultSet.FETCH_FORWARD);
-    EasyMock.expect(statement.executeQuery()).andReturn(resultSet);
-
-    EasyMock.expect(resultSet.next()).andReturn(true);
-
-    EasyMock.expect(resultSet.getLong(1)).andReturn(1L);
-    EasyMock.expect(resultSet.getLong(2)).andReturn(2L);
-    EasyMock.expect(resultSet.getDouble(3)).andReturn(0.5);
-    EasyMock.expect(resultSet.next()).andReturn(true);
-
-    EasyMock.expect(resultSet.getLong(1)).andReturn(1L);
-    EasyMock.expect(resultSet.getLong(2)).andReturn(3L);
-    EasyMock.expect(resultSet.getDouble(3)).andReturn(0.4);
-    EasyMock.expect(resultSet.next()).andReturn(true);
-
-    EasyMock.expect(resultSet.getLong(1)).andReturn(3L);
-    EasyMock.expect(resultSet.getLong(2)).andReturn(4L);
-    EasyMock.expect(resultSet.getDouble(3)).andReturn(0.1);
-
-    EasyMock.expect(resultSet.next()).andReturn(false);
-
-    resultSet.close();
-    statement.close();
-    connection.close();
-
-    EasyMock.replay(dataSource, connection, statement, resultSet);
-
-    ItemSimilarity similarity = new MySQLJDBCInMemoryItemSimilarity(dataSource);
-
-    assertEquals(0.5, similarity.itemSimilarity(1L, 2L), EPSILON);
-    assertEquals(0.4, similarity.itemSimilarity(1L, 3L), EPSILON);
-    assertEquals(0.1, similarity.itemSimilarity(3L, 4L), EPSILON);
-    assertTrue(Double.isNaN(similarity.itemSimilarity(1L, 4L)));
-
-    EasyMock.verify(dataSource, connection, statement, resultSet);
-  }
-}
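The test removed above follows EasyMock's record/replay/verify lifecycle: expectations are recorded on the mocks, replay() switches them to playback mode, and verify() fails the test if any recorded call was never made. A minimal self-contained sketch of that cycle; the mocked interactions here are illustrative, not the similarity class's actual queries:

import org.easymock.EasyMock;

import javax.sql.DataSource;
import java.sql.Connection;

public class EasyMockSketch {
  public static void main(String[] args) throws Exception {
    DataSource dataSource = EasyMock.createMock(DataSource.class);
    Connection connection = EasyMock.createMock(Connection.class);

    // Record phase: declare the calls the code under test may make
    EasyMock.expect(dataSource.getConnection()).andReturn(connection);
    connection.close();  // void method: invoking it on the mock records the expectation

    // Replay phase: mocks now answer as recorded
    EasyMock.replay(dataSource, connection);

    try (Connection c = dataSource.getConnection()) {
      // code under test would use c here
    }

    // Verify phase: fails if any recorded call was not made
    EasyMock.verify(dataSource, connection);
  }
}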
http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java b/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java
deleted file mode 100644
index 01d46fc..0000000
--- a/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java
+++ /dev/null
@@ -1,236 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.clustering;
-
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Iterator;
-import java.util.List;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.lucene.analysis.standard.StandardAnalyzer;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-import org.apache.lucene.document.FieldType;
-import org.apache.lucene.document.StringField;
-import org.apache.lucene.index.DirectoryReader;
-import org.apache.lucene.index.IndexOptions;
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.IndexWriter;
-import org.apache.lucene.index.IndexWriterConfig;
-import org.apache.lucene.store.RAMDirectory;
-import org.apache.mahout.clustering.fuzzykmeans.FuzzyKMeansDriver;
-import org.apache.mahout.clustering.kmeans.KMeansDriver;
-import org.apache.mahout.clustering.kmeans.RandomSeedGenerator;
-import org.apache.mahout.common.MahoutTestCase;
-import org.apache.mahout.common.distance.DistanceMeasure;
-import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
-import org.apache.mahout.math.NamedVector;
-import org.apache.mahout.math.Vector;
-import org.apache.mahout.math.VectorWritable;
-import org.apache.mahout.utils.clustering.ClusterDumper;
-import org.apache.mahout.utils.vectors.TermEntry;
-import org.apache.mahout.utils.vectors.TermInfo;
-import org.apache.mahout.utils.vectors.lucene.CachedTermInfo;
-import org.apache.mahout.utils.vectors.lucene.LuceneIterable;
-import org.apache.mahout.vectorizer.TFIDF;
-import org.apache.mahout.vectorizer.Weight;
-import org.junit.Before;
-import org.junit.Test;
-
-public final class TestClusterDumper extends MahoutTestCase {
-
-  private static final String[] DOCS = {
-      "The quick red fox jumped over the lazy brown dogs.",
-      "The quick brown fox jumped over the lazy red dogs.",
-      "The quick red cat jumped over the lazy brown dogs.",
-      "The quick brown cat jumped over the lazy red dogs.",
-      "Mary had a little lamb whose fleece was white as snow.",
-      "Mary had a little goat whose fleece was white as snow.",
-      "Mary had a little lamb whose fleece was black as tar.",
-      "Dick had a little goat whose fleece was white as snow.",
-      "Moby Dick is a story of a whale and a man obsessed.",
-      "Moby Bob is a story of a walrus and a man obsessed.",
-      "Moby Dick is a story of a whale and a crazy man.",
-      "The robber wore a black fleece jacket and a baseball cap.",
-      "The robber wore a red fleece jacket and a baseball cap.",
-      "The robber wore a white fleece jacket and a baseball cap.",
-      "The English Springer Spaniel is the best of all dogs."};
-
-  private List<VectorWritable> sampleData;
-
-  private String[] termDictionary;
-
-  @Override
-  @Before
-  public void setUp() throws Exception {
-    super.setUp();
-    Configuration conf = getConfiguration();
-    FileSystem fs = FileSystem.get(conf);
-    // Create test data
-    getSampleData(DOCS);
-    ClusteringTestUtils.writePointsToFile(sampleData, true,
-        getTestTempFilePath("testdata/file1"), fs, conf);
-  }
-
-  private void getSampleData(String[] docs2) throws IOException {
-    sampleData = new ArrayList<>();
-    RAMDirectory directory = new RAMDirectory();
-    try (IndexWriter writer = new IndexWriter(directory,
-        new IndexWriterConfig(new StandardAnalyzer()))) {
-      for (int i = 0; i < docs2.length; i++) {
-        Document doc = new Document();
-        Field id = new StringField("id", "doc_" + i, Field.Store.YES);
-        doc.add(id);
-        // Store both position and offset information
-        FieldType fieldType = new FieldType();
-        fieldType.setStored(false);
-        fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
-        fieldType.setTokenized(true);
-        fieldType.setStoreTermVectors(true);
-        fieldType.setStoreTermVectorPositions(true);
-        fieldType.setStoreTermVectorOffsets(true);
-        fieldType.freeze();
-        Field text = new Field("content", docs2[i], fieldType);
-        doc.add(text);
-        writer.addDocument(doc);
-      }
-    }
-
-    IndexReader reader = DirectoryReader.open(directory);
-
-    Weight weight = new TFIDF();
-    TermInfo termInfo = new CachedTermInfo(reader, "content", 1, 100);
-
-    int numTerms = 0;
-    for (Iterator<TermEntry> it = termInfo.getAllEntries(); it.hasNext();) {
-      it.next();
-      numTerms++;
-    }
-    termDictionary = new String[numTerms];
-    int i = 0;
-    for (Iterator<TermEntry> it = termInfo.getAllEntries(); it.hasNext();) {
-      String term = it.next().getTerm();
-      termDictionary[i] = term;
-      System.out.println(i + " " + term);
-      i++;
-    }
-    Iterable<Vector> iterable = new LuceneIterable(reader, "id", "content",
-        termInfo, weight);
-
-    i = 0;
-    for (Vector vector : iterable) {
-      assertNotNull(vector);
-      NamedVector namedVector;
-      if (vector instanceof NamedVector) {
-        // rename it for testing purposes
-        namedVector = new NamedVector(((NamedVector) vector).getDelegate(),
-            "P(" + i + ')');
-
-      } else {
-        namedVector = new NamedVector(vector, "P(" + i + ')');
-      }
-      System.out.println(AbstractCluster.formatVector(namedVector,
-          termDictionary));
-      sampleData.add(new VectorWritable(namedVector));
-      i++;
-    }
-  }
-
-  /**
-   * Return the path to the final iteration's clusters
-   */
-  private static Path finalClusterPath(Configuration conf, Path output,
-      int maxIterations) throws IOException {
-    FileSystem fs = FileSystem.get(conf);
-    for (int i = maxIterations; i >= 0; i--) {
-      Path clusters = new Path(output, "clusters-" + i + "-final");
-      if (fs.exists(clusters)) {
-        return clusters;
-      }
-    }
-    return null;
-  }
-
-  @Test
-  public void testKmeans() throws Exception {
-    DistanceMeasure measure = new EuclideanDistanceMeasure();
-    Path input = getTestTempFilePath("input");
-    Path output = getTestTempDirPath("output");
-    Path initialPoints = new Path(output, Cluster.CLUSTERS_DIR + '0' + Cluster.FINAL_ITERATION_SUFFIX);
-    Configuration conf = getConfiguration();
-    FileSystem fs = FileSystem.get(conf);
-    // Write test data to file
-    ClusteringTestUtils.writePointsToFile(sampleData, input, fs, conf);
-    // Select initial centroids
-    RandomSeedGenerator.buildRandom(conf, input, initialPoints, 8, measure, 1L);
-    // Run k-means
-    Path kMeansOutput = new Path(output, "kmeans");
-    KMeansDriver.run(conf, getTestTempDirPath("testdata"), initialPoints, kMeansOutput, 0.001, 10, true, 0.0, false);
-    // Print out clusters
-    ClusterDumper clusterDumper = new ClusterDumper(finalClusterPath(conf,
-        output, 10), new Path(kMeansOutput, "clusteredPoints"));
-    clusterDumper.printClusters(termDictionary);
-  }
-
getTestTempFilePath("input"); - Path output = getTestTempDirPath("output"); - Path initialPoints = new Path(output, Cluster.CLUSTERS_DIR + '0' + Cluster.FINAL_ITERATION_SUFFIX); - Configuration conf = getConfiguration(); - FileSystem fs = FileSystem.get(conf); - // Write test data to file - ClusteringTestUtils.writePointsToFile(sampleData, input, fs, conf); - // Select initial centroids - RandomSeedGenerator.buildRandom(conf, input, initialPoints, 8, measure, 1L); - // Run k-means - Path kmeansOutput = new Path(output, "kmeans"); - KMeansDriver.run(conf, getTestTempDirPath("testdata"), initialPoints, kmeansOutput, 0.001, 10, true, 0.0, false); - // Print out clusters - ClusterDumper clusterDumper = new ClusterDumper(finalClusterPath(conf, - output, 10), new Path(kmeansOutput, "clusteredPoints")); - clusterDumper.setOutputFormat(ClusterDumper.OUTPUT_FORMAT.JSON); - clusterDumper.printClusters(termDictionary); - } - - @Test - public void testFuzzyKmeans() throws Exception { - DistanceMeasure measure = new EuclideanDistanceMeasure(); - Path input = getTestTempFilePath("input"); - Path output = getTestTempDirPath("output"); - Path initialPoints = new Path(output, Cluster.CLUSTERS_DIR + '0' + Cluster.FINAL_ITERATION_SUFFIX); - Configuration conf = getConfiguration(); - FileSystem fs = FileSystem.get(conf); - // Write test data to file - ClusteringTestUtils.writePointsToFile(sampleData, input, fs, conf); - // Select initial centroids - RandomSeedGenerator.buildRandom(conf, input, initialPoints, 8, measure, 1L); - // Run k-means - Path kMeansOutput = new Path(output, "kmeans"); - FuzzyKMeansDriver.run(conf, getTestTempDirPath("testdata"), initialPoints, kMeansOutput, 0.001, 10, 1.1f, true, - true, 0, true); - // run ClusterDumper - ClusterDumper clusterDumper = new ClusterDumper(finalClusterPath(conf, - output, 10), new Path(kMeansOutput, "clusteredPoints")); - clusterDumper.printClusters(termDictionary); - } -}
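One detail worth noting in the test removed above: finalClusterPath() locates the last completed k-means iteration by scanning backwards from maxIterations for a "clusters-<i>-final" directory. A sketch of the same scan using java.nio in place of Hadoop's FileSystem; the temp-directory setup in main() is illustrative only:

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;

public class FinalClusterScan {
  static Path finalClusterPath(Path output, int maxIterations) {
    // Walk back from maxIterations; the highest existing iteration wins
    for (int i = maxIterations; i >= 0; i--) {
      Path clusters = output.resolve("clusters-" + i + "-final");
      if (Files.exists(clusters)) {
        return clusters;
      }
    }
    return null;  // no completed iteration found
  }

  public static void main(String[] args) throws IOException {
    Path output = Files.createTempDirectory("kmeans-output");
    Files.createDirectory(output.resolve("clusters-3-final"));
    System.out.println(finalClusterPath(output, 10));  // .../clusters-3-final
  }
}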
