http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/utils/regex/RegexMapper.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/utils/regex/RegexMapper.java b/integration/src/main/java/org/apache/mahout/utils/regex/RegexMapper.java deleted file mode 100644 index 04cacaa..0000000 --- a/integration/src/main/java/org/apache/mahout/utils/regex/RegexMapper.java +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.utils.regex; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.List; -import java.util.regex.Pattern; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.io.LongWritable; -import org.apache.hadoop.io.Text; -import org.apache.hadoop.mapreduce.Mapper; -import org.apache.lucene.analysis.Analyzer; -import org.apache.mahout.common.ClassUtils; - -public class RegexMapper extends Mapper<LongWritable, Text, LongWritable, Text> { - - public static final String REGEX = "regex"; - public static final String GROUP_MATCHERS = "regex.groups"; - public static final String TRANSFORMER_CLASS = "transformer.class"; - public static final String FORMATTER_CLASS = "formatter.class"; - - private Pattern regex; - private List<Integer> groupsToKeep; - private RegexTransformer transformer = RegexUtils.IDENTITY_TRANSFORMER; - private RegexFormatter formatter = RegexUtils.IDENTITY_FORMATTER; - public static final String ANALYZER_NAME = "analyzerName"; - - - @Override - protected void setup(Context context) throws IOException, InterruptedException { - groupsToKeep = new ArrayList<>(); - Configuration config = context.getConfiguration(); - String regexStr = config.get(REGEX); - regex = Pattern.compile(regexStr); - String[] groups = config.getStrings(GROUP_MATCHERS); - if (groups != null) { - for (String group : groups) { - groupsToKeep.add(Integer.parseInt(group)); - } - } - - transformer = ClassUtils.instantiateAs(config.get(TRANSFORMER_CLASS, IdentityTransformer.class.getName()), - RegexTransformer.class); - String analyzerName = config.get(ANALYZER_NAME); - if (analyzerName != null && transformer instanceof AnalyzerTransformer) { - Analyzer analyzer = ClassUtils.instantiateAs(analyzerName, Analyzer.class); - ((AnalyzerTransformer)transformer).setAnalyzer(analyzer); - } - - formatter = ClassUtils.instantiateAs(config.get(FORMATTER_CLASS, IdentityFormatter.class.getName()), - RegexFormatter.class); - } - - - 
@Override - protected void map(LongWritable key, Text text, Context context) throws IOException, InterruptedException { - String result = RegexUtils.extract(text.toString(), regex, groupsToKeep, " ", transformer); - if (!result.isEmpty()) { - String format = formatter.format(result); - context.write(key, new Text(format)); - } - } -}
http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/utils/regex/RegexTransformer.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/utils/regex/RegexTransformer.java b/integration/src/main/java/org/apache/mahout/utils/regex/RegexTransformer.java deleted file mode 100644 index adbc98f..0000000 --- a/integration/src/main/java/org/apache/mahout/utils/regex/RegexTransformer.java +++ /dev/null @@ -1,27 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.utils.regex; - -/** - * Transforms the match of a regular expression. 
/**
 * Transforms the text matched by a regular expression (or by one of its capture
 * groups) before it is appended to the extraction output.
 *
 * <p>Being a single-abstract-method type, implementations may be supplied as
 * lambda expressions.</p>
 */
@FunctionalInterface
public interface RegexTransformer {

  /**
   * @param match the matched text to transform
   * @return the transformed text emitted in place of {@code match}
   */
  String transformMatch(String match);

}
- */ - -package org.apache.mahout.utils.regex; - -import java.util.Collection; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -public final class RegexUtils { - - public static final RegexTransformer IDENTITY_TRANSFORMER = new IdentityTransformer(); - public static final RegexFormatter IDENTITY_FORMATTER = new IdentityFormatter(); - - private RegexUtils() { - } - - public static String extract(CharSequence line, Pattern pattern, Collection<Integer> groupsToKeep, - String separator, RegexTransformer transformer) { - StringBuilder bldr = new StringBuilder(); - extract(line, bldr, pattern, groupsToKeep, separator, transformer); - return bldr.toString(); - } - - public static void extract(CharSequence line, StringBuilder outputBuffer, - Pattern pattern, Collection<Integer> groupsToKeep, String separator, - RegexTransformer transformer) { - if (transformer == null) { - transformer = IDENTITY_TRANSFORMER; - } - Matcher matcher = pattern.matcher(line); - String match; - if (groupsToKeep.isEmpty()) { - while (matcher.find()) { - match = matcher.group(); - if (match != null) { - outputBuffer.append(transformer.transformMatch(match)).append(separator); - } - } - } else { - while (matcher.find()) { - for (Integer groupNum : groupsToKeep) { - match = matcher.group(groupNum); - if (match != null) { - outputBuffer.append(transformer.transformMatch(match)).append(separator); - } - } - } - } - //trim off the last separator, which is always there - if (outputBuffer.length() > 0) { - outputBuffer.setLength(outputBuffer.length() - separator.length()); - } - } -} http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/utils/regex/URLDecodeTransformer.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/utils/regex/URLDecodeTransformer.java b/integration/src/main/java/org/apache/mahout/utils/regex/URLDecodeTransformer.java deleted file 
mode 100644 index 3eb7fc0..0000000 --- a/integration/src/main/java/org/apache/mahout/utils/regex/URLDecodeTransformer.java +++ /dev/null @@ -1,43 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.utils.regex; - -import java.io.UnsupportedEncodingException; -import java.net.URLDecoder; - -public final class URLDecodeTransformer implements RegexTransformer { - - private final String enc; - - public URLDecodeTransformer() { - enc = "UTF-8"; - } - - public URLDecodeTransformer(String encoding) { - this.enc = encoding; - } - - @Override - public String transformMatch(String match) { - try { - return URLDecoder.decode(match, enc); - } catch (UnsupportedEncodingException e) { - throw new IllegalStateException(e); - } - } -} http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/utils/vectors/RowIdJob.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/RowIdJob.java b/integration/src/main/java/org/apache/mahout/utils/vectors/RowIdJob.java deleted file mode 100644 index 13d61b8..0000000 --- 
a/integration/src/main/java/org/apache/mahout/utils/vectors/RowIdJob.java +++ /dev/null @@ -1,99 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * <p/> - * http://www.apache.org/licenses/LICENSE-2.0 - * <p/> - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.utils.vectors; - -import java.util.List; -import java.util.Map; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.IntWritable; -import org.apache.hadoop.io.SequenceFile; -import org.apache.hadoop.io.Text; -import org.apache.hadoop.util.ToolRunner; -import org.apache.mahout.common.AbstractJob; -import org.apache.mahout.common.Pair; -import org.apache.mahout.common.iterator.sequencefile.PathFilters; -import org.apache.mahout.common.iterator.sequencefile.PathType; -import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable; -import org.apache.mahout.math.VectorWritable; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * Converts a vector representation of documents into a {@code document x terms} matrix. 
- * The input data is in {@code SequenceFile<Text,VectorWritable>} format (as generated by - * {@link org.apache.mahout.vectorizer.SparseVectorsFromSequenceFiles SparseVectorsFromSequenceFiles} - * or by {@link org.apache.mahout.vectorizer.EncodedVectorsFromSequenceFiles EncodedVectorsFromSequenceFiles}) - * and generates the following two files as output: - * <ul><li>A file called "matrix" of format {@code SequenceFile<IntWritable,VectorWritable>}.</li> - * <li>A file called "docIndex" of format {@code SequenceFile<IntWritable,Text>}.</li></ul> - * The input file can be regenerated by joining the two output files on the generated int key. - * In other words, {@code RowIdJob} replaces the document text ids by integers. - * The original document text ids can still be retrieved from the "docIndex". - */ -public class RowIdJob extends AbstractJob { - private static final Logger log = LoggerFactory.getLogger(RowIdJob.class); - - @Override - public int run(String[] args) throws Exception { - - addInputOption(); - addOutputOption(); - - Map<String, List<String>> parsedArgs = parseArguments(args); - if (parsedArgs == null) { - return -1; - } - - Configuration conf = getConf(); - FileSystem fs = FileSystem.get(conf); - - Path outputPath = getOutputPath(); - Path indexPath = new Path(outputPath, "docIndex"); - Path matrixPath = new Path(outputPath, "matrix"); - - try (SequenceFile.Writer indexWriter = SequenceFile.createWriter(fs, conf, indexPath, - IntWritable.class, Text.class); - SequenceFile.Writer matrixWriter = SequenceFile.createWriter(fs, conf, matrixPath, IntWritable.class, - VectorWritable.class)) { - IntWritable docId = new IntWritable(); - int i = 0; - int numCols = 0; - for (Pair<Text, VectorWritable> record - : new SequenceFileDirIterable<Text, VectorWritable>(getInputPath(), PathType.LIST, PathFilters.logsCRCFilter(), - null, true, conf)) { - VectorWritable value = record.getSecond(); - docId.set(i); - indexWriter.append(docId, record.getFirst()); - 
matrixWriter.append(docId, value); - i++; - numCols = value.get().size(); - } - - log.info("Wrote out matrix with {} rows and {} columns to {}", i, numCols, matrixPath); - return 0; - } - } - - public static void main(String[] args) throws Exception { - ToolRunner.run(new RowIdJob(), args); - } - -} http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/utils/vectors/TermEntry.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/TermEntry.java b/integration/src/main/java/org/apache/mahout/utils/vectors/TermEntry.java deleted file mode 100644 index d74803f..0000000 --- a/integration/src/main/java/org/apache/mahout/utils/vectors/TermEntry.java +++ /dev/null @@ -1,46 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.utils.vectors; - -/** - * Each entry in a {@link TermInfo} dictionary. Contains information about a term. 
/**
 * One entry of a {@link TermInfo} dictionary: a term together with its
 * dictionary index and its document frequency. Instances are immutable.
 */
public class TermEntry {

  private final String term;
  private final int termIdx;
  private final int docFreq;

  /**
   * @param term    the term text
   * @param termIdx the term's index in the dictionary
   * @param docFreq number of documents the term occurs in
   */
  public TermEntry(String term, int termIdx, int docFreq) {
    this.term = term;
    this.termIdx = termIdx;
    this.docFreq = docFreq;
  }

  /** @return the term text */
  public String getTerm() {
    return term;
  }

  /** @return the term's index in the dictionary */
  public int getTermIdx() {
    return termIdx;
  }

  /** @return the number of documents the term occurs in */
  public int getDocFreq() {
    return docFreq;
  }
}
- */ - -package org.apache.mahout.utils.vectors; - -import java.util.Iterator; - -/** - * Contains the term dictionary information associated with a vectorized collection of text documents - * - */ -public interface TermInfo { - - int totalTerms(String field); - - TermEntry getTermEntry(String field, String term); - - Iterator<TermEntry> getAllEntries(); -} http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java b/integration/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java deleted file mode 100644 index e1c3fbc..0000000 --- a/integration/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java +++ /dev/null @@ -1,266 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * <p/> - * http://www.apache.org/licenses/LICENSE-2.0 - * <p/> - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.utils.vectors; - -import com.google.common.collect.Sets; -import com.google.common.io.Closeables; -import com.google.common.io.Files; -import org.apache.commons.io.Charsets; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.FileUtil; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.Writable; -import org.apache.hadoop.util.ToolRunner; -import org.apache.mahout.clustering.classify.WeightedPropertyVectorWritable; -import org.apache.mahout.common.AbstractJob; -import org.apache.mahout.common.Pair; -import org.apache.mahout.common.iterator.sequencefile.PathFilters; -import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterable; -import org.apache.mahout.math.NamedVector; -import org.apache.mahout.math.Vector; -import org.apache.mahout.math.VectorWritable; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.File; -import java.io.IOException; -import java.io.OutputStreamWriter; -import java.io.Writer; -import java.util.Iterator; -import java.util.Set; - -/** - * Can read in a {@link org.apache.hadoop.io.SequenceFile} of {@link Vector}s and dump - * out the results using {@link Vector#asFormatString()} to either the console or to a - * file. 
- */ -public final class VectorDumper extends AbstractJob { - - private static final Logger log = LoggerFactory.getLogger(VectorDumper.class); - - private VectorDumper() { - } - - @Override - public int run(String[] args) throws Exception { - /** - Option seqOpt = obuilder.withLongName("seqFile").withRequired(false).withArgument( - abuilder.withName("seqFile").withMinimum(1).withMaximum(1).create()).withDescription( - "The Sequence File containing the Vectors").withShortName("s").create(); - Option dirOpt = obuilder.withLongName("seqDirectory").withRequired(false).withArgument( - abuilder.withName("seqDirectory").withMinimum(1).withMaximum(1).create()) - .withDescription("The directory containing Sequence File of Vectors") - .withShortName("d").create(); - */ - addInputOption(); - addOutputOption(); - addOption("useKey", "u", "If the Key is a vector than dump that instead"); - addOption("printKey", "p", "Print out the key as well, delimited by tab (or the value if useKey is true"); - addOption("dictionary", "d", "The dictionary file.", false); - addOption("dictionaryType", "dt", "The dictionary file type (text|seqfile)", false); - addOption("csv", "c", "Output the Vector as CSV. 
Otherwise it substitutes in the terms for vector cell entries"); - addOption("namesAsComments", "n", "If using CSV output, optionally add a comment line for each NamedVector " - + "(if the vector is one) printing out the name"); - addOption("nameOnly", "N", "Use the name as the value for each NamedVector (skip other vectors)"); - addOption("sortVectors", "sort", "Sort output key/value pairs of the vector entries in abs magnitude " - + "descending order"); - addOption("quiet", "q", "Print only file contents"); - addOption("sizeOnly", "sz", "Dump only the size of the vector"); - addOption("numItems", "ni", "Output at most <n> vecors", false); - addOption("vectorSize", "vs", "Truncate vectors to <vs> length when dumping (most useful when in" - + " conjunction with -sort", false); - addOption(buildOption("filter", "fi", "Only dump out those vectors whose name matches the filter." - + " Multiple items may be specified by repeating the argument.", true, 1, Integer.MAX_VALUE, false, null)); - - if (parseArguments(args, false, true) == null) { - return -1; - } - - Path[] pathArr; - Configuration conf = new Configuration(); - FileSystem fs = FileSystem.get(conf); - Path input = getInputPath(); - FileStatus fileStatus = fs.getFileStatus(input); - if (fileStatus.isDir()) { - pathArr = FileUtil.stat2Paths(fs.listStatus(input, PathFilters.logsCRCFilter())); - } else { - FileStatus[] inputPaths = fs.globStatus(input); - pathArr = new Path[inputPaths.length]; - int i = 0; - for (FileStatus fstatus : inputPaths) { - pathArr[i++] = fstatus.getPath(); - } - } - - - String dictionaryType = getOption("dictionaryType", "text"); - - boolean sortVectors = hasOption("sortVectors"); - boolean quiet = hasOption("quiet"); - if (!quiet) { - log.info("Sort? 
{}", sortVectors); - } - - String[] dictionary = null; - if (hasOption("dictionary")) { - String dictFile = getOption("dictionary"); - switch (dictionaryType) { - case "text": - dictionary = VectorHelper.loadTermDictionary(new File(dictFile)); - break; - case "sequencefile": - dictionary = VectorHelper.loadTermDictionary(conf, dictFile); - break; - default: - //TODO: support Lucene's FST as a dictionary type - throw new IOException("Invalid dictionary type: " + dictionaryType); - } - } - - Set<String> filters; - if (hasOption("filter")) { - filters = Sets.newHashSet(getOptions("filter")); - } else { - filters = null; - } - - boolean useCSV = hasOption("csv"); - - boolean sizeOnly = hasOption("sizeOnly"); - boolean nameOnly = hasOption("nameOnly"); - boolean namesAsComments = hasOption("namesAsComments"); - boolean transposeKeyValue = hasOption("vectorAsKey"); - Writer writer; - boolean shouldClose; - File output = getOutputFile(); - if (output != null) { - shouldClose = true; - log.info("Output file: {}", output); - Files.createParentDirs(output); - writer = Files.newWriter(output, Charsets.UTF_8); - } else { - shouldClose = false; - writer = new OutputStreamWriter(System.out, Charsets.UTF_8); - } - try { - boolean printKey = hasOption("printKey"); - if (useCSV && dictionary != null) { - writer.write("#"); - for (int j = 0; j < dictionary.length; j++) { - writer.write(dictionary[j]); - if (j < dictionary.length - 1) { - writer.write(','); - } - } - writer.write('\n'); - } - Long numItems = null; - if (hasOption("numItems")) { - numItems = Long.parseLong(getOption("numItems")); - if (quiet) { - writer.append("#Max Items to dump: ").append(String.valueOf(numItems)).append('\n'); - } - } - int maxIndexesPerVector = hasOption("vectorSize") - ? 
Integer.parseInt(getOption("vectorSize")) - : Integer.MAX_VALUE; - long itemCount = 0; - int fileCount = 0; - for (Path path : pathArr) { - if (numItems != null && numItems <= itemCount) { - break; - } - if (quiet) { - log.info("Processing file '{}' ({}/{})", path, ++fileCount, pathArr.length); - } - SequenceFileIterable<Writable, Writable> iterable = new SequenceFileIterable<>(path, true, conf); - Iterator<Pair<Writable, Writable>> iterator = iterable.iterator(); - long i = 0; - while (iterator.hasNext() && (numItems == null || itemCount < numItems)) { - Pair<Writable, Writable> record = iterator.next(); - Writable keyWritable = record.getFirst(); - Writable valueWritable = record.getSecond(); - if (printKey) { - Writable notTheVectorWritable = transposeKeyValue ? valueWritable : keyWritable; - writer.write(notTheVectorWritable.toString()); - writer.write('\t'); - } - Vector vector; - try { - vector = ((VectorWritable) - (transposeKeyValue ? keyWritable : valueWritable)).get(); - } catch (ClassCastException e) { - if ((transposeKeyValue ? keyWritable : valueWritable) - instanceof WeightedPropertyVectorWritable) { - vector = - ((WeightedPropertyVectorWritable) - (transposeKeyValue ? 
keyWritable : valueWritable)).getVector(); - } else { - throw e; - } - } - if (filters == null - || !(vector instanceof NamedVector) - || filters.contains(((NamedVector) vector).getName())) { - if (sizeOnly) { - if (vector instanceof NamedVector) { - writer.write(((NamedVector) vector).getName()); - writer.write(":"); - } else { - writer.write(String.valueOf(i++)); - writer.write(":"); - } - writer.write(String.valueOf(vector.size())); - writer.write('\n'); - } else if (nameOnly) { - if (vector instanceof NamedVector) { - writer.write(((NamedVector) vector).getName()); - writer.write('\n'); - } - } else { - String fmtStr; - if (useCSV) { - fmtStr = VectorHelper.vectorToCSVString(vector, namesAsComments); - } else { - fmtStr = VectorHelper.vectorToJson(vector, dictionary, maxIndexesPerVector, - sortVectors); - } - writer.write(fmtStr); - writer.write('\n'); - } - itemCount++; - } - } - } - writer.flush(); - } finally { - if (shouldClose) { - Closeables.close(writer, false); - } - } - - return 0; - } - - public static void main(String[] args) throws Exception { - ToolRunner.run(new Configuration(), new VectorDumper(), args); - } - -} http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java b/integration/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java deleted file mode 100644 index 66c3fb6..0000000 --- a/integration/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java +++ /dev/null @@ -1,256 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.utils.vectors; - -import com.google.common.base.Function; -import com.google.common.collect.Collections2; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.IntWritable; -import org.apache.hadoop.io.Text; -import org.apache.lucene.util.PriorityQueue; -import org.apache.mahout.common.Pair; -import org.apache.mahout.common.iterator.FileLineIterator; -import org.apache.mahout.common.iterator.sequencefile.PathType; -import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable; -import org.apache.mahout.math.NamedVector; -import org.apache.mahout.math.Vector; -import org.apache.mahout.math.Vector.Element; -import org.apache.mahout.math.map.OpenObjectIntHashMap; - -import java.io.File; -import java.io.FileInputStream; -import java.io.IOException; -import java.io.InputStream; -import java.util.ArrayList; -import java.util.Collection; -import java.util.Collections; -import java.util.Comparator; -import java.util.Iterator; -import java.util.List; -import java.util.regex.Pattern; - -/** Static utility methods related to vectors. 
*/ -public final class VectorHelper { - - private static final Pattern TAB_PATTERN = Pattern.compile("\t"); - - - private VectorHelper() { - } - - public static String vectorToCSVString(Vector vector, boolean namesAsComments) throws IOException { - Appendable bldr = new StringBuilder(2048); - vectorToCSVString(vector, namesAsComments, bldr); - return bldr.toString(); - } - - public static String buildJson(Iterable<Pair<String, Double>> iterable) { - return buildJson(iterable, new StringBuilder(2048)); - } - - public static String buildJson(Iterable<Pair<String, Double>> iterable, StringBuilder bldr) { - bldr.append('{'); - for (Pair<String, Double> p : iterable) { - bldr.append(p.getFirst()); - bldr.append(':'); - bldr.append(p.getSecond()); - bldr.append(','); - } - if (bldr.length() > 1) { - bldr.setCharAt(bldr.length() - 1, '}'); - } - return bldr.toString(); - } - - public static List<Pair<Integer, Double>> topEntries(Vector vector, int maxEntries) { - - // Get the size of nonZero elements in the input vector - int sizeOfNonZeroElementsInVector = vector.getNumNonZeroElements(); - - // If the sizeOfNonZeroElementsInVector < maxEntries then set maxEntries = sizeOfNonZeroElementsInVector - // otherwise the call to queue.pop() returns a Pair(null, null) and the subsequent call - // to pair.getFirst() throws a NullPointerException - if (sizeOfNonZeroElementsInVector < maxEntries) { - maxEntries = sizeOfNonZeroElementsInVector; - } - - PriorityQueue<Pair<Integer, Double>> queue = new TDoublePQ<>(-1, maxEntries); - for (Element e : vector.nonZeroes()) { - queue.insertWithOverflow(Pair.of(e.index(), e.get())); - } - List<Pair<Integer, Double>> entries = new ArrayList<>(); - Pair<Integer, Double> pair; - while ((pair = queue.pop()) != null) { - if (pair.getFirst() > -1) { - entries.add(pair); - } - } - Collections.sort(entries, new Comparator<Pair<Integer, Double>>() { - @Override - public int compare(Pair<Integer, Double> a, Pair<Integer, Double> b) { - return 
b.getSecond().compareTo(a.getSecond()); - } - }); - return entries; - } - - public static List<Pair<Integer, Double>> firstEntries(Vector vector, int maxEntries) { - List<Pair<Integer, Double>> entries = new ArrayList<>(); - Iterator<Vector.Element> it = vector.nonZeroes().iterator(); - int i = 0; - while (it.hasNext() && i++ < maxEntries) { - Vector.Element e = it.next(); - entries.add(Pair.of(e.index(), e.get())); - } - return entries; - } - - public static List<Pair<String, Double>> toWeightedTerms(Collection<Pair<Integer, Double>> entries, - final String[] dictionary) { - if (dictionary != null) { - return new ArrayList<>(Collections2.transform(entries, - new Function<Pair<Integer, Double>, Pair<String, Double>>() { - @Override - public Pair<String, Double> apply(Pair<Integer, Double> p) { - return Pair.of(dictionary[p.getFirst()], p.getSecond()); - } - })); - } else { - return new ArrayList<>(Collections2.transform(entries, - new Function<Pair<Integer, Double>, Pair<String, Double>>() { - @Override - public Pair<String, Double> apply(Pair<Integer, Double> p) { - return Pair.of(Integer.toString(p.getFirst()), p.getSecond()); - } - })); - } - } - - public static String vectorToJson(Vector vector, String[] dictionary, int maxEntries, boolean sort) { - return buildJson(toWeightedTerms(sort - ? topEntries(vector, maxEntries) - : firstEntries(vector, maxEntries), dictionary)); - } - - public static void vectorToCSVString(Vector vector, - boolean namesAsComments, - Appendable bldr) throws IOException { - if (namesAsComments && vector instanceof NamedVector) { - bldr.append('#').append(((NamedVector) vector).getName()).append('\n'); - } - Iterator<Vector.Element> iter = vector.all().iterator(); - boolean first = true; - while (iter.hasNext()) { - if (first) { - first = false; - } else { - bldr.append(','); - } - Vector.Element elt = iter.next(); - bldr.append(String.valueOf(elt.get())); - } - bldr.append('\n'); - } - - /** - * Read in a dictionary file. 
Format is: - * <p/> - * <pre> - * term DocFreq Index - * </pre> - */ - public static String[] loadTermDictionary(File dictFile) throws IOException { - try (InputStream in = new FileInputStream(dictFile)) { - return loadTermDictionary(in); - } - } - - /** - * Read a dictionary in {@link org.apache.hadoop.io.SequenceFile} generated by - * {@link org.apache.mahout.vectorizer.DictionaryVectorizer} - * - * @param filePattern <PATH TO DICTIONARY>/dictionary.file-* - */ - public static String[] loadTermDictionary(Configuration conf, String filePattern) { - OpenObjectIntHashMap<String> dict = new OpenObjectIntHashMap<>(); - int maxIndexValue = 0; - for (Pair<Text, IntWritable> record - : new SequenceFileDirIterable<Text, IntWritable>(new Path(filePattern), PathType.GLOB, null, null, true, - conf)) { - dict.put(record.getFirst().toString(), record.getSecond().get()); - if (record.getSecond().get() > maxIndexValue) { - maxIndexValue = record.getSecond().get(); - } - } - // Set dictionary size to greater of (maxIndexValue + 1, dict.size()) - int maxDictionarySize = maxIndexValue + 1 > dict.size() ? maxIndexValue + 1 : dict.size(); - String[] dictionary = new String[maxDictionarySize]; - for (String feature : dict.keys()) { - dictionary[dict.get(feature)] = feature; - } - return dictionary; - } - - /** - * Read in a dictionary file. 
Format is: First line is the number of entries - * <p/> - * <pre> - * term DocFreq Index - * </pre> - */ - private static String[] loadTermDictionary(InputStream is) throws IOException { - FileLineIterator it = new FileLineIterator(is); - - int numEntries = Integer.parseInt(it.next()); - String[] result = new String[numEntries]; - - while (it.hasNext()) { - String line = it.next(); - if (line.startsWith("#")) { - continue; - } - String[] tokens = TAB_PATTERN.split(line); - if (tokens.length < 3) { - continue; - } - int index = Integer.parseInt(tokens[2]); // tokens[1] is the doc freq - result[index] = tokens[0]; - } - return result; - } - - private static final class TDoublePQ<T> extends PriorityQueue<Pair<T, Double>> { - private final T sentinel; - - private TDoublePQ(T sentinel, int size) { - super(size); - this.sentinel = sentinel; - } - - @Override - protected boolean lessThan(Pair<T, Double> a, Pair<T, Double> b) { - return a.getSecond().compareTo(b.getSecond()) < 0; - } - - @Override - protected Pair<T, Double> getSentinelObject() { - return Pair.of(sentinel, Double.NEGATIVE_INFINITY); - } - } -} http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFIterator.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFIterator.java b/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFIterator.java deleted file mode 100644 index f2632a4..0000000 --- a/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFIterator.java +++ /dev/null @@ -1,144 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.utils.vectors.arff; - -import java.io.BufferedReader; -import java.io.IOException; -import java.util.ArrayList; -import java.util.List; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -import com.google.common.collect.AbstractIterator; -import com.google.common.io.Closeables; -import org.apache.mahout.math.DenseVector; -import org.apache.mahout.math.RandomAccessSparseVector; -import org.apache.mahout.math.Vector; - -final class ARFFIterator extends AbstractIterator<Vector> { - - // This pattern will make sure a , inside a string is not a point for split. 
- // Ex: "Arizona" , "0:08 PM, PDT" , 110 will be split considering "0:08 PM, PDT" as one string - private static final Pattern WORDS_WITHOUT_SPARSE = Pattern.compile("([\\w[^{]])*"); - private static final Pattern DATA_PATTERN = Pattern.compile("^\\"+ARFFModel.ARFF_SPARSE+"(.*)\\"+ARFFModel.ARFF_SPARSE_END+"$"); - - private final BufferedReader reader; - private final ARFFModel model; - - ARFFIterator(BufferedReader reader, ARFFModel model) { - this.reader = reader; - this.model = model; - } - - @Override - protected Vector computeNext() { - String line; - try { - while ((line = reader.readLine()) != null) { - line = line.trim(); - if (!line.isEmpty() && !line.startsWith(ARFFModel.ARFF_COMMENT)) { - break; - } - } - } catch (IOException ioe) { - throw new IllegalStateException(ioe); - } - if (line == null) { - try { - Closeables.close(reader, true); - } catch (IOException e) { - throw new IllegalStateException(e); - } - return endOfData(); - } - Vector result; - Matcher contents = DATA_PATTERN.matcher(line); - if (contents.find()) { - line = contents.group(1); - String[] splits = splitCSV(line); - result = new RandomAccessSparseVector(model.getLabelSize()); - for (String split : splits) { - int idIndex = split.indexOf(' '); - int idx = Integer.parseInt(split.substring(0, idIndex).trim()); - String data = split.substring(idIndex).trim(); - if (!"?".equals(data)) { - result.setQuick(idx, model.getValue(data, idx)); - } - } - } else { - result = new DenseVector(model.getLabelSize()); - String[] splits = splitCSV(line); - for (int i = 0; i < splits.length; i++) { - String split = splits[i]; - split = split.trim(); - if (WORDS_WITHOUT_SPARSE.matcher(split).matches() && !"?".equals(split)) { - result.setQuick(i, model.getValue(split, i)); - } - } - } - return result; - } - - /** - * Splits a string by comma, ignores commas inside quotes and escaped quotes. 
- * As quotes are both double and single possible, because there is no exact definition - * for ARFF files - * @param line - - * @return String[] - */ - public static String[] splitCSV(String line) { - StringBuilder sb = new StringBuilder(128); - List<String> tokens = new ArrayList<>(); - char escapeChar = '\0'; - for (int i = 0; i < line.length(); i++) { - char c = line.charAt(i); - if (c == '\\') { - i++; - sb.append(line.charAt(i)); - } - else if (c == '"' || c == '\'') { - // token is closed - if (c == escapeChar) { - escapeChar = '\0'; - } - else if (escapeChar == '\0') { - escapeChar = c; - } - sb.append(c); - } - else if (c == ',') { - if (escapeChar == '\0') { - tokens.add(sb.toString().trim()); - sb.setLength(0); // start work on next token - } - else { - sb.append(c); - } - } - else { - sb.append(c); - } - } - if (sb.length() > 0) { - tokens.add(sb.toString().trim()); - } - - return tokens.toArray(new String[tokens.size()]); - } - -} http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFModel.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFModel.java b/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFModel.java deleted file mode 100644 index fc86997..0000000 --- a/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFModel.java +++ /dev/null @@ -1,76 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.mahout.utils.vectors.arff;

import java.text.DateFormat;
import java.util.Map;

/**
 * An interface for representing an ARFFModel. Implementations can decide on the best approach
 * for storing the model, as some approaches will be fine for smaller files, while larger
 * ones may require a better implementation.
 */
public interface ARFFModel {
  String ARFF_SPARSE = "{"; //indicates the vector is sparse
  String ARFF_SPARSE_END = "}";
  String ARFF_COMMENT = "%";
  String ATTRIBUTE = "@attribute";
  String DATA = "@data";
  String RELATION = "@relation";


  /** @return the name declared on the {@code @relation} line */
  String getRelation();

  void setRelation(String relation);

  /**
   * The vector attributes (labels in Mahout speak)
   * @return the map
   */
  Map<String, Integer> getLabelBindings();

  /** @return the ordinal assigned to {@code nominal} for attribute {@code label} */
  Integer getNominalValue(String label, String nominal);

  /** Registers nominal value {@code nominal} of attribute {@code label} at ordinal {@code idx}. */
  void addNominal(String label, String nominal, int idx);

  /** @return the date format declared for the attribute at vector index {@code idx} */
  DateFormat getDateFormat(Integer idx);

  void addDateFormat(Integer idx, DateFormat format);

  /** @return the vector index bound to {@code label} */
  Integer getLabelIndex(String label);

  void addLabel(String label, Integer idx);

  /** @return the declared type of the attribute at vector index {@code idx} */
  ARFFType getARFFType(Integer idx);

  void addType(Integer idx, ARFFType type);

  /**
   * The count of the number of words seen
   * @return the count
   */
  long getWordCount();

  /**
   * Converts the raw string {@code data} of the attribute at vector index {@code idx}
   * into its numeric vector value.
   */
  double getValue(String data, int idx);

  /** @return attribute label -> (nominal value -> ordinal) */
  Map<String, Map<String, Integer>> getNominalMap();

  /** @return the number of attributes, i.e. the cardinality of produced vectors */
  int getLabelSize();

  // NOTE(review): presumably the word -> id map used for STRING attributes — confirm
  // against MapBackedARFFModel.
  Map<String, Long> getWords();
}
---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFType.java b/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFType.java deleted file mode 100644 index 9ba7c31..0000000 --- a/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFType.java +++ /dev/null @@ -1,62 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.utils.vectors.arff; - -public enum ARFFType { - - NUMERIC("numeric"), - INTEGER("integer"), - REAL("real"), - NOMINAL("{"), - DATE("date"), - STRING("string"); - - private final String indicator; - - ARFFType(String indicator) { - this.indicator = indicator; - } - - public String getIndicator() { - return indicator; - } - - public String getLabel(String line) { - int idx = line.lastIndexOf(indicator); - return removeQuotes(line.substring(ARFFModel.ATTRIBUTE.length(), idx)); - } - - /** - * Remove quotes and leading/trailing whitespace from a single or double quoted string - * @param str quotes from - * @return A string without quotes - */ - public static String removeQuotes(String str) { - String cleaned = str; - if (cleaned != null) { - cleaned = cleaned.trim(); - boolean isQuoted = cleaned.length() > 1 - && (cleaned.startsWith("\"") && cleaned.endsWith("\"") - || cleaned.startsWith("'") && cleaned.endsWith("'")); - if (isQuoted) { - cleaned = cleaned.substring(1, cleaned.length() - 1); - } - } - return cleaned; - } -} http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterable.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterable.java b/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterable.java deleted file mode 100644 index 180a1e1..0000000 --- a/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterable.java +++ /dev/null @@ -1,155 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.utils.vectors.arff; - -import java.io.BufferedReader; -import java.io.File; -import java.io.IOException; -import java.io.Reader; -import java.io.StringReader; -import java.nio.charset.Charset; -import java.text.DateFormat; -import java.text.SimpleDateFormat; -import java.util.Iterator; -import java.util.Locale; - -import com.google.common.io.Files; -import org.apache.commons.io.Charsets; -import org.apache.mahout.math.Vector; - -/** - * Read in ARFF (http://www.cs.waikato.ac.nz/~ml/weka/arff.html) and create {@link Vector}s - * <p/> - * Attribute type handling: - * <ul> - * <li>Numeric -> As is</li> - * <li>Nominal -> ordinal(value) i.e. @attribute lumber {'\'(-inf-0.5]\'','\'(0.5-inf)\''} - * will convert -inf-0.5 -> 0, and 0.5-inf -> 1</li> - * <li>Dates -> Convert to time as a long</li> - * <li>Strings -> Create a map of String -> long</li> - * </ul> - * NOTE: This class does not set the label bindings on every vector. If you want the label - * bindings, call {@link MapBackedARFFModel#getLabelBindings()}, as they are the same for every vector. 
 */
public class ARFFVectorIterable implements Iterable<Vector> {

  // Positioned at the first @data row once construction completes.
  private final BufferedReader buff;
  // Accumulates labels, types, nominal ordinals and date formats while the header is parsed.
  private final ARFFModel model;

  public ARFFVectorIterable(File file, ARFFModel model) throws IOException {
    this(file, Charsets.UTF_8, model);
  }

  public ARFFVectorIterable(File file, Charset encoding, ARFFModel model) throws IOException {
    this(Files.newReader(file, encoding), model);
  }

  public ARFFVectorIterable(String arff, ARFFModel model) throws IOException {
    this(new StringReader(arff), model);
  }

  /**
   * Parses the ARFF header ({@code @relation} / {@code @attribute} lines) into
   * {@code model}, leaving the reader positioned at the first data row.
   *
   * @throws IOException if the header cannot be read
   * @throws UnsupportedOperationException on a malformed or unrecognized attribute declaration
   */
  public ARFFVectorIterable(Reader reader, ARFFModel model) throws IOException {
    if (reader instanceof BufferedReader) {
      buff = (BufferedReader) reader;
    } else {
      buff = new BufferedReader(reader);
    }
    //grab the attributes, then start the iterator at the first line of data
    this.model = model;

    int labelNumber = 0; // next vector index to assign to an attribute
    String line;
    while ((line = buff.readLine()) != null) {
      line = line.trim();
      if (!line.startsWith(ARFFModel.ARFF_COMMENT) && !line.isEmpty()) {
        Integer labelNumInt = labelNumber;
        // Split the leading keyword from the rest of the declaration.
        String[] lineParts = line.split("[\\s\\t]+", 2);

        // is it a relation name?
        if (lineParts[0].equalsIgnoreCase(ARFFModel.RELATION)) {
          model.setRelation(ARFFType.removeQuotes(lineParts[1]));
        }
        // or an attribute
        else if (lineParts[0].equalsIgnoreCase(ARFFModel.ATTRIBUTE)) {
          String label;
          ARFFType type;

          // split the name of the attribute and its description
          String[] attrParts = lineParts[1].split("[\\s\\t]+", 2);
          if (attrParts.length < 2)
            throw new UnsupportedOperationException("No type for attribute found: " + lineParts[1]);

          // label is attribute name (lower-cased, quotes stripped)
          label = ARFFType.removeQuotes(attrParts[0].toLowerCase());
          if (attrParts[1].equalsIgnoreCase(ARFFType.NUMERIC.getIndicator())) {
            type = ARFFType.NUMERIC;
          } else if (attrParts[1].equalsIgnoreCase(ARFFType.INTEGER.getIndicator())) {
            type = ARFFType.INTEGER;
          } else if (attrParts[1].equalsIgnoreCase(ARFFType.REAL.getIndicator())) {
            type = ARFFType.REAL;
          } else if (attrParts[1].equalsIgnoreCase(ARFFType.STRING.getIndicator())) {
            type = ARFFType.STRING;
          } else if (attrParts[1].toLowerCase().startsWith(ARFFType.NOMINAL.getIndicator())) {
            type = ARFFType.NOMINAL;
            // nominal example:
            // @ATTRIBUTE class {Iris-setosa,'Iris versicolor',Iris-virginica}
            // NOTE(review): assumes the declaration ends with the closing '}' — confirm.
            String[] classes = ARFFIterator.splitCSV(attrParts[1].substring(1, attrParts[1].length() - 1));
            // Nominal ordinals are 1-based.
            for (int i = 0; i < classes.length; i++) {
              model.addNominal(label, ARFFType.removeQuotes(classes[i]), i + 1);
            }
          } else if (attrParts[1].toLowerCase().startsWith(ARFFType.DATE.getIndicator())) {
            type = ARFFType.DATE;
            //TODO: DateFormatter map
            // Default to an ISO-8601-style format when no explicit pattern is given.
            DateFormat format = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss", Locale.ENGLISH);
            String formStr = attrParts[1].substring(ARFFType.DATE.getIndicator().length()).trim();
            if (!formStr.isEmpty()) {
              // NOTE(review): only double quotes are stripped, and a leading quote is
              // assumed to imply a trailing one — confirm against real ARFF inputs.
              if (formStr.startsWith("\"")) {
                formStr = formStr.substring(1, formStr.length() - 1);
              }
              format = new SimpleDateFormat(formStr, Locale.ENGLISH);
            }
            model.addDateFormat(labelNumInt, format);
            //@attribute <name> date [<date-format>]
          } else {
            throw new UnsupportedOperationException("Invalid attribute: " + attrParts[1]);
          }
          model.addLabel(label, labelNumInt);
          model.addType(labelNumInt, type);
          labelNumber++;
        } else if (lineParts[0].equalsIgnoreCase(ARFFModel.DATA)) {
          break; // header done; data rows are consumed lazily by the iterator
        }
      }
    }

  }

  @Override
  public Iterator<Vector> iterator() {
    // Each call shares the same reader; the iterator consumes rows from the current position.
    return new ARFFIterator(buff, model);
  }

  /**
   * Returns info about the ARFF content that was parsed.
   *
   * @return the model
   */
  public ARFFModel getModel() {
    return model;
  }
}
 */

package org.apache.mahout.utils.vectors.arff;

import java.io.File;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.Writer;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;

import com.google.common.io.Files;
import org.apache.commons.cli2.CommandLine;
import org.apache.commons.cli2.Group;
import org.apache.commons.cli2.Option;
import org.apache.commons.cli2.OptionException;
import org.apache.commons.cli2.builder.ArgumentBuilder;
import org.apache.commons.cli2.builder.DefaultOptionBuilder;
import org.apache.commons.cli2.builder.GroupBuilder;
import org.apache.commons.cli2.commandline.Parser;
import org.apache.commons.io.Charsets;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.mahout.common.CommandLineUtil;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
import org.apache.mahout.utils.vectors.io.SequenceFileVectorWriter;
import org.apache.mahout.utils.vectors.io.VectorWriter;
import org.codehaus.jackson.map.ObjectMapper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Command-line tool that converts ARFF files into Mahout vector sequence files
 * ({@code LongWritable -> VectorWritable}) and writes the attribute/nominal-value
 * bindings to a dictionary file, either as delimited text or as JSON.
 */
public final class Driver {

  private static final Logger log = LoggerFactory.getLogger(Driver.class);

  /** used for JSON serialization/deserialization */
  private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

  private Driver() {
  }

  /**
   * CLI entry point. Required options: --input (-d), --output (-o), --dictOut (-t);
   * optional: --max (-m), --json-dictonary (-j), --delimiter (-l), --help (-h).
   */
  public static void main(String[] args) throws IOException {
    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();

    Option inputOpt = obuilder
        .withLongName("input")
        .withRequired(true)
        .withArgument(abuilder.withName("input").withMinimum(1).withMaximum(1).create())
        .withDescription(
            "The file or directory containing the ARFF files. If it is a directory, all .arff files will be converted")
        .withShortName("d").create();

    Option outputOpt = obuilder.withLongName("output").withRequired(true).withArgument(
        abuilder.withName("output").withMinimum(1).withMaximum(1).create()).withDescription(
        "The output directory. Files will have the same name as the input, but with the extension .mvc")
        .withShortName("o").create();

    Option maxOpt = obuilder.withLongName("max").withRequired(false).withArgument(
        abuilder.withName("max").withMinimum(1).withMaximum(1).create()).withDescription(
        "The maximum number of vectors to output. If not specified, then it will loop over all docs")
        .withShortName("m").create();

    Option dictOutOpt = obuilder.withLongName("dictOut").withRequired(true).withArgument(
        abuilder.withName("dictOut").withMinimum(1).withMaximum(1).create()).withDescription(
        "The file to output the label bindings").withShortName("t").create();

    Option jsonDictonaryOpt = obuilder.withLongName("json-dictonary").withRequired(false)
        .withDescription("Write dictonary in JSON format").withShortName("j").create();

    Option delimiterOpt = obuilder.withLongName("delimiter").withRequired(false).withArgument(
        abuilder.withName("delimiter").withMinimum(1).withMaximum(1).create()).withDescription(
        "The delimiter for outputing the dictionary").withShortName("l").create();

    Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h")
        .create();
    Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(outputOpt).withOption(maxOpt)
        .withOption(helpOpt).withOption(dictOutOpt).withOption(jsonDictonaryOpt).withOption(delimiterOpt)
        .create();

    try {
      Parser parser = new Parser();
      parser.setGroup(group);
      CommandLine cmdLine = parser.parse(args);

      if (cmdLine.hasOption(helpOpt)) {

        CommandLineUtil.printHelp(group);
        return;
      }
      // NOTE(review): inputOpt is declared required, so this check is always true here.
      if (cmdLine.hasOption(inputOpt)) {
        File input = new File(cmdLine.getValue(inputOpt).toString());
        long maxDocs = Long.MAX_VALUE; // default: convert everything
        if (cmdLine.hasOption(maxOpt)) {
          maxDocs = Long.parseLong(cmdLine.getValue(maxOpt).toString());
        }
        if (maxDocs < 0) {
          throw new IllegalArgumentException("maxDocs must be >= 0");
        }
        String outDir = cmdLine.getValue(outputOpt).toString();
        log.info("Output Dir: {}", outDir);

        String delimiter = cmdLine.hasOption(delimiterOpt) ? cmdLine.getValue(delimiterOpt).toString() : "\t";
        File dictOut = new File(cmdLine.getValue(dictOutOpt).toString());
        boolean jsonDictonary = cmdLine.hasOption(jsonDictonaryOpt);
        // One shared model so word ids and nominal mappings stay consistent across files.
        ARFFModel model = new MapBackedARFFModel();
        if (input.exists() && input.isDirectory()) {
          // Directory input: convert every *.arff file it contains.
          File[] files = input.listFiles(new FilenameFilter() {
            @Override
            public boolean accept(File file, String name) {
              return name.endsWith(".arff");
            }
          });

          // NOTE(review): listFiles() can return null on an I/O error, which would NPE
          // in this loop — confirm whether that is acceptable here.
          for (File file : files) {
            writeFile(outDir, file, maxDocs, model, dictOut, delimiter, jsonDictonary);
          }
        } else {
          writeFile(outDir, input, maxDocs, model, dictOut, delimiter, jsonDictonary);
        }
      }

    } catch (OptionException e) {
      log.error("Exception", e);
      CommandLineUtil.printHelp(group);
    }
  }

  /**
   * Writes the model's label bindings to {@code dictOut}, choosing the JSON or the
   * delimited-text layout. The writer is opened in append mode.
   */
  protected static void writeLabelBindings(File dictOut, ARFFModel arffModel, String delimiter, boolean jsonDictonary)
      throws IOException {
    try (Writer writer = Files.newWriterSupplier(dictOut, Charsets.UTF_8, true).getOutput()) {
      if (jsonDictonary) {
        writeLabelBindingsJSON(writer, arffModel);
      } else {
        writeLabelBindings(writer, arffModel, delimiter);
      }
    }
  }

  /**
   * Emits one JSON object per attribute (in vector-index order) with keys
   * "attribute", "type" ("categorical"/"numerical"), optional "values", and
   * "label" — true only for the last attribute, which is treated as the class label.
   */
  protected static void writeLabelBindingsJSON(Writer writer, ARFFModel arffModel) throws IOException {

    // Turn the map of labels into a list order by order of appearance
    List<Entry<String, Integer>> attributes = new ArrayList<>();
    attributes.addAll(arffModel.getLabelBindings().entrySet());
    Collections.sort(attributes, new Comparator<Map.Entry<String, Integer>>() {
      @Override
      public int compare(Entry<String, Integer> t, Entry<String, Integer> t1) {
        return t.getValue().compareTo(t1.getValue());
      }
    });

    // write a map for each object
    List<Map<String, Object>> jsonObjects = new LinkedList<>();
    for (int i = 0; i < attributes.size(); i++) {

      Entry<String, Integer> modelRepresentation = attributes.get(i);
      Map<String, Object> jsonRepresentation = new HashMap<>();
      jsonObjects.add(jsonRepresentation);
      // the last one is the class label
      jsonRepresentation.put("label", i < (attributes.size() - 1) ? String.valueOf(false) : String.valueOf(true));
      String attribute = modelRepresentation.getKey();
      jsonRepresentation.put("attribute", attribute);
      Map<String, Integer> nominalValues = arffModel.getNominalMap().get(attribute);

      if (nominalValues != null) {
        // NOTE(review): new String[1] leaves a null element when the set is empty;
        // new String[0] would be the usual idiom — confirm before changing.
        String[] values = nominalValues.keySet().toArray(new String[1]);

        jsonRepresentation.put("values", values);
        jsonRepresentation.put("type", "categorical");
      } else {
        jsonRepresentation.put("type", "numerical");
      }
    }
    writer.write(OBJECT_MAPPER.writeValueAsString(jsonObjects));
  }

  /**
   * Emits the delimited-text dictionary: one "label<delimiter>index" line per
   * attribute, followed by a section listing each nominal attribute's values.
   */
  protected static void writeLabelBindings(Writer writer, ARFFModel arffModel, String delimiter) throws IOException {

    Map<String, Integer> labels = arffModel.getLabelBindings();
    writer.write("Label bindings for Relation " + arffModel.getRelation() + '\n');
    for (Map.Entry<String, Integer> entry : labels.entrySet()) {
      writer.write(entry.getKey());
      writer.write(delimiter);
      writer.write(String.valueOf(entry.getValue()));
      writer.write('\n');
    }
    writer.write('\n');
    writer.write("Values for nominal attributes\n");
    // emit allowed values for NOMINAL/categorical/enumerated attributes
    Map<String, Map<String, Integer>> nominalMap = arffModel.getNominalMap();
    // how many nominal attributes
    writer.write(String.valueOf(nominalMap.size()) + "\n");

    for (Entry<String, Map<String, Integer>> entry : nominalMap.entrySet()) {
      // the label of this attribute
      writer.write(entry.getKey() + "\n");
      Set<Entry<String, Integer>> attributeValues = entry.getValue().entrySet();
      // how many values does this attribute have
      writer.write(attributeValues.size() + "\n");
      for (Map.Entry<String, Integer> value : attributeValues) {
        // the value and the value index
        writer.write(String.format("%s%s%s\n", value.getKey(), delimiter, value.getValue().toString()));
      }
    }
  }

  /**
   * Converts a single ARFF file to {@code <outDir>/<name>.mvc} (a sequence file of
   * vectors) and rewrites the dictionary. A fresh model seeded with the shared
   * model's words/nominals keeps ids consistent across multiple input files.
   */
  protected static void writeFile(String outDir,
                                  File file,
                                  long maxDocs,
                                  ARFFModel arffModel,
                                  File dictOut,
                                  String delimiter,
                                  boolean jsonDictonary) throws IOException {
    log.info("Converting File: {}", file);
    ARFFModel model = new MapBackedARFFModel(arffModel.getWords(), arffModel.getWordCount() + 1, arffModel
        .getNominalMap());
    Iterable<Vector> iteratable = new ARFFVectorIterable(file, model);
    String outFile = outDir + '/' + file.getName() + ".mvc";

    try (VectorWriter vectorWriter = getSeqFileWriter(outFile)) {
      long numDocs = vectorWriter.write(iteratable, maxDocs);
      writeLabelBindings(dictOut, model, delimiter, jsonDictonary);
      log.info("Wrote: {} vectors", numDocs);
    }
  }

  /** Opens a {@code <LongWritable, VectorWritable>} sequence-file writer for {@code outFile}. */
  private static VectorWriter getSeqFileWriter(String outFile) throws IOException {
    Path path = new Path(outFile);
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);
    SequenceFile.Writer seqWriter = SequenceFile.createWriter(fs, conf, path, LongWritable.class,
        VectorWritable.class);
    return new SequenceFileVectorWriter(seqWriter);
  }

}
a/integration/src/main/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModel.java b/integration/src/main/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModel.java deleted file mode 100644 index e911b1a..0000000 --- a/integration/src/main/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModel.java +++ /dev/null @@ -1,282 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.utils.vectors.arff; - -import java.text.DateFormat; -import java.text.NumberFormat; -import java.text.ParseException; -import java.text.ParsePosition; -import java.text.SimpleDateFormat; -import java.util.Collections; -import java.util.Date; -import java.util.HashMap; -import java.util.Locale; -import java.util.Map; -import java.util.regex.Pattern; - -/** - * Holds ARFF information in {@link Map}. 
- */ -public class MapBackedARFFModel implements ARFFModel { - - private static final Pattern QUOTE_PATTERN = Pattern.compile("\""); - - private long wordCount = 1; - - private String relation; - - private final Map<String,Integer> labelBindings; - private final Map<Integer,String> idxLabel; - private final Map<Integer,ARFFType> typeMap; // key is the vector index, value is the type - private final Map<Integer,DateFormat> dateMap; - private final Map<String,Map<String,Integer>> nominalMap; - private final Map<String,Long> words; - - public MapBackedARFFModel() { - this(new HashMap<String,Long>(), 1, new HashMap<String,Map<String,Integer>>()); - } - - public MapBackedARFFModel(Map<String,Long> words, long wordCount, Map<String,Map<String,Integer>> nominalMap) { - this.words = words; - this.wordCount = wordCount; - labelBindings = new HashMap<>(); - idxLabel = new HashMap<>(); - typeMap = new HashMap<>(); - dateMap = new HashMap<>(); - this.nominalMap = nominalMap; - - } - - @Override - public String getRelation() { - return relation; - } - - @Override - public void setRelation(String relation) { - this.relation = relation; - } - - /** - * Convert a piece of String data at a specific spot into a value - * - * @param data - * The data to convert - * @param idx - * The position in the ARFF data - * @return A double representing the data - */ - @Override - public double getValue(String data, int idx) { - ARFFType type = typeMap.get(idx); - if (type == null) { - throw new IllegalArgumentException("Attribute type cannot be NULL, attribute index was: " + idx); - } - data = QUOTE_PATTERN.matcher(data).replaceAll(""); - data = data.trim(); - double result; - switch (type) { - case NUMERIC: - case INTEGER: - case REAL: - result = processNumeric(data); - break; - case DATE: - result = processDate(data, idx); - break; - case STRING: - // may have quotes - result = processString(data); - break; - case NOMINAL: - String label = idxLabel.get(idx); - result = processNominal(label, 
data); - break; - default: - throw new IllegalStateException("Unknown type: " + type); - } - return result; - } - - protected double processNominal(String label, String data) { - double result; - Map<String,Integer> classes = nominalMap.get(label); - if (classes != null) { - Integer ord = classes.get(ARFFType.removeQuotes(data)); - if (ord != null) { - result = ord; - } else { - throw new IllegalStateException("Invalid nominal: " + data + " for label: " + label); - } - } else { - throw new IllegalArgumentException("Invalid nominal label: " + label + " Data: " + data); - } - - return result; - } - - // Not sure how scalable this is going to be - protected double processString(String data) { - data = QUOTE_PATTERN.matcher(data).replaceAll(""); - // map it to an long - Long theLong = words.get(data); - if (theLong == null) { - theLong = wordCount++; - words.put(data, theLong); - } - return theLong; - } - - protected static double processNumeric(String data) { - if (isNumeric(data)) { - return Double.parseDouble(data); - } - return Double.NaN; - } - - public static boolean isNumeric(String str) { - NumberFormat formatter = NumberFormat.getInstance(); - ParsePosition parsePosition = new ParsePosition(0); - formatter.parse(str, parsePosition); - return str.length() == parsePosition.getIndex(); - } - - protected double processDate(String data, int idx) { - DateFormat format = dateMap.get(idx); - if (format == null) { - format = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss", Locale.ENGLISH); - } - double result; - try { - Date date = format.parse(data); - result = date.getTime(); // hmmm, what kind of loss casting long to double? 
- } catch (ParseException e) { - throw new IllegalArgumentException(e); - } - return result; - } - - /** - * The vector attributes (labels in Mahout speak), unmodifiable - * - * @return the map - */ - @Override - public Map<String,Integer> getLabelBindings() { - return Collections.unmodifiableMap(labelBindings); - } - - /** - * The map of types encountered - * - * @return the map - */ - public Map<Integer,ARFFType> getTypeMap() { - return Collections.unmodifiableMap(typeMap); - } - - /** - * Map of Date formatters used - * - * @return the map - */ - public Map<Integer,DateFormat> getDateMap() { - return Collections.unmodifiableMap(dateMap); - } - - /** - * Map nominals to ids. Should only be modified by calling {@link ARFFModel#addNominal(String, String, int)} - * - * @return the map - */ - @Override - public Map<String,Map<String,Integer>> getNominalMap() { - return nominalMap; - } - - /** - * Immutable map of words to the long id used for those words - * - * @return The map - */ - @Override - public Map<String,Long> getWords() { - return words; - } - - @Override - public Integer getNominalValue(String label, String nominal) { - return nominalMap.get(label).get(nominal); - } - - @Override - public void addNominal(String label, String nominal, int idx) { - Map<String,Integer> noms = nominalMap.get(label); - if (noms == null) { - noms = new HashMap<>(); - nominalMap.put(label, noms); - } - noms.put(nominal, idx); - } - - @Override - public DateFormat getDateFormat(Integer idx) { - return dateMap.get(idx); - } - - @Override - public void addDateFormat(Integer idx, DateFormat format) { - dateMap.put(idx, format); - } - - @Override - public Integer getLabelIndex(String label) { - return labelBindings.get(label); - } - - @Override - public void addLabel(String label, Integer idx) { - labelBindings.put(label, idx); - idxLabel.put(idx, label); - } - - @Override - public ARFFType getARFFType(Integer idx) { - return typeMap.get(idx); - } - - @Override - public void 
addType(Integer idx, ARFFType type) { - typeMap.put(idx, type); - } - - /** - * The count of the number of words seen - * - * @return the count - */ - @Override - public long getWordCount() { - return wordCount; - } - - @Override - public int getLabelSize() { - return labelBindings.size(); - } -} http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterator.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterator.java b/integration/src/main/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterator.java deleted file mode 100644 index 3c583fd..0000000 --- a/integration/src/main/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterator.java +++ /dev/null @@ -1,69 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.utils.vectors.csv; - -import java.io.IOException; -import java.io.Reader; - -import com.google.common.collect.AbstractIterator; -import org.apache.commons.csv.CSVParser; -import org.apache.commons.csv.CSVStrategy; -import org.apache.mahout.math.DenseVector; -import org.apache.mahout.math.Vector; - -/** - * Iterates a CSV file and produces {@link org.apache.mahout.math.Vector}. - * <br/> - * The Iterator returned throws {@link UnsupportedOperationException} for the {@link java.util.Iterator#remove()} - * method. - * <p/> - * Assumes DenseVector for now, but in the future may have the option of mapping columns to sparse format - * <p/> - * The Iterator is not thread-safe. - */ -public class CSVVectorIterator extends AbstractIterator<Vector> { - - private final CSVParser parser; - - public CSVVectorIterator(Reader reader) { - parser = new CSVParser(reader); - } - - public CSVVectorIterator(Reader reader, CSVStrategy strategy) { - parser = new CSVParser(reader, strategy); - } - - @Override - protected Vector computeNext() { - String[] line; - try { - line = parser.getLine(); - } catch (IOException e) { - throw new IllegalStateException(e); - } - if (line == null) { - return endOfData(); - } - Vector result = new DenseVector(line.length); - for (int i = 0; i < line.length; i++) { - result.setQuick(i, Double.parseDouble(line[i])); - } - return result; - } - -} http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/utils/vectors/io/DelimitedTermInfoWriter.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/io/DelimitedTermInfoWriter.java b/integration/src/main/java/org/apache/mahout/utils/vectors/io/DelimitedTermInfoWriter.java deleted file mode 100644 index b5f9f2b..0000000 --- a/integration/src/main/java/org/apache/mahout/utils/vectors/io/DelimitedTermInfoWriter.java +++ /dev/null @@ -1,73 +0,0 
@@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.utils.vectors.io; - -import java.io.IOException; -import java.io.Writer; -import java.util.Iterator; - -import com.google.common.io.Closeables; -import org.apache.mahout.utils.vectors.TermEntry; -import org.apache.mahout.utils.vectors.TermInfo; - -/** - * Write {@link TermInfo} to a {@link Writer} in a textual, delimited format with header. 
- */ -public class DelimitedTermInfoWriter implements TermInfoWriter { - - private final Writer writer; - private final String delimiter; - private final String field; - - public DelimitedTermInfoWriter(Writer writer, String delimiter, String field) { - this.writer = writer; - this.delimiter = delimiter; - this.field = field; - } - - @Override - public void write(TermInfo ti) throws IOException { - - Iterator<TermEntry> entIter = ti.getAllEntries(); - try { - writer.write(String.valueOf(ti.totalTerms(field))); - writer.write('\n'); - writer.write("#term" + delimiter + "doc freq" + delimiter + "idx"); - writer.write('\n'); - while (entIter.hasNext()) { - TermEntry entry = entIter.next(); - writer.write(entry.getTerm()); - writer.write(delimiter); - writer.write(String.valueOf(entry.getDocFreq())); - writer.write(delimiter); - writer.write(String.valueOf(entry.getTermIdx())); - writer.write('\n'); - } - } finally { - Closeables.close(writer, false); - } - } - - /** - * Does NOT close the underlying writer - */ - @Override - public void close() { - - } -} http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorWriter.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorWriter.java b/integration/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorWriter.java deleted file mode 100644 index 0d763a1..0000000 --- a/integration/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorWriter.java +++ /dev/null @@ -1,75 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.utils.vectors.io; - -import java.io.IOException; - -import com.google.common.io.Closeables; -import org.apache.hadoop.io.LongWritable; -import org.apache.hadoop.io.SequenceFile; -import org.apache.mahout.math.Vector; -import org.apache.mahout.math.VectorWritable; - - -/** - * Writes out Vectors to a SequenceFile. - * - * Closes the writer when done - */ -public class SequenceFileVectorWriter implements VectorWriter { - private final SequenceFile.Writer writer; - private long recNum = 0; - public SequenceFileVectorWriter(SequenceFile.Writer writer) { - this.writer = writer; - } - - @Override - public long write(Iterable<Vector> iterable, long maxDocs) throws IOException { - - for (Vector point : iterable) { - if (recNum >= maxDocs) { - break; - } - if (point != null) { - writer.append(new LongWritable(recNum++), new VectorWritable(point)); - } - - } - return recNum; - } - - @Override - public void write(Vector vector) throws IOException { - writer.append(new LongWritable(recNum++), new VectorWritable(vector)); - - } - - @Override - public long write(Iterable<Vector> iterable) throws IOException { - return write(iterable, Long.MAX_VALUE); - } - - @Override - public void close() throws IOException { - Closeables.close(writer, false); - } - - public SequenceFile.Writer getWriter() { - return writer; - } -} 
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.mahout.utils.vectors.io;

import java.io.Closeable;
import java.io.IOException;

import org.apache.mahout.utils.vectors.TermInfo;

/**
 * Writes a {@link TermInfo} to some output destination.
 * Implementations decide the concrete format and sink.
 */
public interface TermInfoWriter extends Closeable {

  /**
   * Writes the given term information.
   *
   * @param ti the term information to write
   * @throws IOException if writing to the underlying destination fails
   */
  void write(TermInfo ti) throws IOException;

}
