http://git-wip-us.apache.org/repos/asf/mahout/blob/85f9ece6/integration/src/main/java/org/apache/mahout/utils/SequenceFileDumper.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/SequenceFileDumper.java b/integration/src/main/java/org/apache/mahout/utils/SequenceFileDumper.java
index 2dcc8b0..e01868a 100644
--- a/integration/src/main/java/org/apache/mahout/utils/SequenceFileDumper.java
+++ b/integration/src/main/java/org/apache/mahout/utils/SequenceFileDumper.java
@@ -17,14 +17,15 @@
 package org.apache.mahout.utils;
 
-import com.google.common.base.Charsets;
-import com.google.common.collect.Lists;
-import com.google.common.io.Closeables;
-import com.google.common.io.Files;
 import java.io.File;
 import java.io.OutputStreamWriter;
 import java.io.Writer;
+import java.util.ArrayList;
 import java.util.List;
+
+import com.google.common.io.Closeables;
+import com.google.common.io.Files;
+import org.apache.commons.io.Charsets;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.FileUtil;
@@ -136,7 +137,7 @@ public final class SequenceFileDumper extends AbstractJob {
       }
     }
     if (facets != null) {
-      List<String> keyList = Lists.newArrayListWithCapacity(facets.size());
+      List<String> keyList = new ArrayList<>(facets.size());
       IntArrayList valueList = new IntArrayList(facets.size());
       facets.pairsSortedByKey(keyList, valueList);
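
The hunk above is representative of most of this commit: Guava's collection factory methods, which mainly existed to avoid repeating type parameters under Java 6, are replaced by the Java 7 diamond operator. A minimal before/after sketch (the variable names are illustrative, not from the patch):

    import java.util.ArrayList;
    import java.util.List;

    public class DiamondExample {
      public static void main(String[] args) {
        // Pre-Java-7 style, with Guava inferring the type parameter:
        //   List<String> keys = Lists.newArrayListWithCapacity(16);
        // Java 7 diamond operator; no third-party dependency needed:
        List<String> keys = new ArrayList<>(16);
        keys.add("cluster-0");
        System.out.println(keys);
      }
    }
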
http://git-wip-us.apache.org/repos/asf/mahout/blob/85f9ece6/integration/src/main/java/org/apache/mahout/utils/SplitInput.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/SplitInput.java b/integration/src/main/java/org/apache/mahout/utils/SplitInput.java
index 562e7df..6178f80 100644
--- a/integration/src/main/java/org/apache/mahout/utils/SplitInput.java
+++ b/integration/src/main/java/org/apache/mahout/utils/SplitInput.java
@@ -17,10 +17,17 @@
 package org.apache.mahout.utils;
 
-import com.google.common.base.Charsets;
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
+import java.nio.charset.Charset;
+import java.util.BitSet;
+
 import com.google.common.base.Preconditions;
-import com.google.common.io.Closeables;
 import org.apache.commons.cli2.OptionException;
+import org.apache.commons.io.Charsets;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
@@ -40,14 +47,6 @@
 import org.apache.mahout.math.jet.random.sampling.RandomSampler;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import java.io.BufferedReader;
-import java.io.IOException;
-import java.io.InputStreamReader;
-import java.io.OutputStreamWriter;
-import java.io.Writer;
-import java.nio.charset.Charset;
-import java.util.BitSet;
-
 /**
  * A utility for splitting files in the input format used by the Bayes
  * classifiers or anything else that has one item per line or SequenceFiles (key/value)
@@ -379,12 +378,9 @@ public class SplitInput extends AbstractJob {
     int trainCount = 0;
     int testCount = 0;
     if (!useSequence) {
-      BufferedReader reader = new BufferedReader(new InputStreamReader(fs.open(inputFile), charset));
-      Writer trainingWriter = new OutputStreamWriter(fs.create(trainingOutputFile), charset);
-      Writer testWriter = new OutputStreamWriter(fs.create(testOutputFile), charset);
-
-
-      try {
+      try (BufferedReader reader = new BufferedReader(new InputStreamReader(fs.open(inputFile), charset));
+           Writer trainingWriter = new OutputStreamWriter(fs.create(trainingOutputFile), charset);
+           Writer testWriter = new OutputStreamWriter(fs.create(testOutputFile), charset)){
 
         String line;
         int pos = 0;
@@ -412,19 +408,14 @@ public class SplitInput extends AbstractJob {
           writer.write('\n');
         }
 
-      } finally {
-        Closeables.close(reader, true);
-        Closeables.close(trainingWriter, false);
-        Closeables.close(testWriter, false);
       }
     } else {
-      SequenceFileIterator<Writable, Writable> iterator =
-          new SequenceFileIterator<>(inputFile, false, fs.getConf());
-      SequenceFile.Writer trainingWriter = SequenceFile.createWriter(fs, fs.getConf(), trainingOutputFile,
-          iterator.getKeyClass(), iterator.getValueClass());
-      SequenceFile.Writer testWriter = SequenceFile.createWriter(fs, fs.getConf(), testOutputFile,
-          iterator.getKeyClass(), iterator.getValueClass());
-      try {
+      try (SequenceFileIterator<Writable, Writable> iterator =
+               new SequenceFileIterator<>(inputFile, false, fs.getConf());
+           SequenceFile.Writer trainingWriter = SequenceFile.createWriter(fs, fs.getConf(), trainingOutputFile,
+               iterator.getKeyClass(), iterator.getValueClass());
+           SequenceFile.Writer testWriter = SequenceFile.createWriter(fs, fs.getConf(), testOutputFile,
+               iterator.getKeyClass(), iterator.getValueClass())) {
 
         int pos = 0;
         while (iterator.hasNext()) {
@@ -450,10 +441,6 @@ public class SplitInput extends AbstractJob {
           writer.append(pair.getFirst(), pair.getSecond());
         }
-      } finally {
-        Closeables.close(iterator, true);
-        Closeables.close(trainingWriter, false);
-        Closeables.close(testWriter, false);
       }
     }
     log.info("file: {}, input: {} train: {}, test: {} starting at {}",
@@ -668,15 +655,11 @@ public class SplitInput extends AbstractJob {
    */
   public static int countLines(FileSystem fs, Path inputFile, Charset charset) throws IOException {
     int lineCount = 0;
-    BufferedReader reader = new BufferedReader(new InputStreamReader(fs.open(inputFile), charset));
-    try {
+    try (BufferedReader reader = new BufferedReader(new InputStreamReader(fs.open(inputFile), charset))){
       while (reader.readLine() != null) {
         lineCount++;
       }
-    } finally {
-      Closeables.close(reader, true);
     }
-
     return lineCount;
   }
 
http://git-wip-us.apache.org/repos/asf/mahout/blob/85f9ece6/integration/src/main/java/org/apache/mahout/utils/SplitInputJob.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/SplitInputJob.java b/integration/src/main/java/org/apache/mahout/utils/SplitInputJob.java
index 825f7a5..4a1ff86 100644
--- a/integration/src/main/java/org/apache/mahout/utils/SplitInputJob.java
+++ b/integration/src/main/java/org/apache/mahout/utils/SplitInputJob.java
@@ -63,6 +63,7 @@ public final class SplitInputJob {
    * training sets respectively
    *
    * @param initialConf
+   *          Initial configuration
    * @param inputPath
    *          path to input data SequenceFile
    * @param outputPath
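
The SplitInput hunks all apply the same mechanical transformation: resources opened before a try block and closed via Closeables.close(...) in finally move into a Java 7 try-with-resources header. A hedged sketch of the pattern (plain FileWriter stands in for the Hadoop streams of the real code):

    import java.io.FileWriter;
    import java.io.IOException;
    import java.io.Writer;

    public class TryWithResourcesExample {
      public static void main(String[] args) throws IOException {
        // Resources declared in the try header are closed automatically,
        // in reverse order of declaration, even if the body throws.
        try (Writer training = new FileWriter("training.txt");
             Writer test = new FileWriter("test.txt")) {
          training.write("train line\n");
          test.write("test line\n");
        } // no finally { Closeables.close(...) } needed
      }
    }

One behavioral nuance of the migration: Closeables.close(reader, true) swallowed an IOException thrown on close, whereas try-with-resources propagates it (and only records it as suppressed if the body already threw).
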
http://git-wip-us.apache.org/repos/asf/mahout/blob/85f9ece6/integration/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java b/integration/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java
index 1856888..75b5ded 100644
--- a/integration/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java
+++ b/integration/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java
@@ -21,11 +21,15 @@ import java.io.File;
 import java.io.IOException;
 import java.io.OutputStreamWriter;
 import java.io.Writer;
+import java.util.ArrayList;
 import java.util.Collections;
 import java.util.List;
 import java.util.Map;
+import java.util.TreeMap;
 
-import com.google.common.collect.Maps;
+import com.google.common.io.Closeables;
+import com.google.common.io.Files;
+import org.apache.commons.io.Charsets;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
@@ -49,11 +53,6 @@ import org.apache.mahout.utils.vectors.VectorHelper;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import com.google.common.base.Charsets;
-import com.google.common.collect.Lists;
-import com.google.common.io.Closeables;
-import com.google.common.io.Files;
-
 public final class ClusterDumper extends AbstractJob {
 
   public static final String SAMPLE_POINTS = "samplePoints";
@@ -304,9 +303,10 @@ public final class ClusterDumper extends AbstractJob {
     this.maxPointsPerCluster = maxPointsPerCluster;
   }
 
-  public static Map<Integer, List<WeightedPropertyVectorWritable>> readPoints(Path pointsPathDir, long maxPointsPerCluster,
-      Configuration conf) {
-    Map<Integer, List<WeightedPropertyVectorWritable>> result = Maps.newTreeMap();
+  public static Map<Integer, List<WeightedPropertyVectorWritable>> readPoints(Path pointsPathDir,
+                                                                              long maxPointsPerCluster,
+                                                                              Configuration conf) {
+    Map<Integer, List<WeightedPropertyVectorWritable>> result = new TreeMap<>();
     for (Pair<IntWritable, WeightedPropertyVectorWritable> record
         : new SequenceFileDirIterable<IntWritable, WeightedPropertyVectorWritable>(pointsPathDir,
             PathType.LIST, PathFilters.logsCRCFilter(), conf)) {
@@ -316,7 +316,7 @@ public final class ClusterDumper extends AbstractJob {
       int keyValue = record.getFirst().get();
       List<WeightedPropertyVectorWritable> pointList = result.get(keyValue);
       if (pointList == null) {
-        pointList = Lists.newArrayList();
+        pointList = new ArrayList<>();
         result.put(keyValue, pointList);
       }
       if (pointList.size() < maxPointsPerCluster) {
http://git-wip-us.apache.org/repos/asf/mahout/blob/85f9ece6/integration/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/BloomTokenFilter.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/BloomTokenFilter.java b/integration/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/BloomTokenFilter.java
index f51e768..964c8cc 100644
--- a/integration/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/BloomTokenFilter.java
+++ b/integration/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/BloomTokenFilter.java
@@ -25,7 +25,7 @@ import java.nio.CharBuffer;
 import java.nio.charset.CharsetEncoder;
 import java.nio.charset.CodingErrorAction;
 
-import com.google.common.base.Charsets;
+import org.apache.commons.io.Charsets;
 import org.apache.hadoop.util.bloom.Filter;
 import org.apache.hadoop.util.bloom.Key;
 import org.apache.lucene.analysis.TokenFilter;
http://git-wip-us.apache.org/repos/asf/mahout/blob/85f9ece6/integration/src/main/java/org/apache/mahout/utils/regex/AnalyzerTransformer.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/regex/AnalyzerTransformer.java b/integration/src/main/java/org/apache/mahout/utils/regex/AnalyzerTransformer.java
index a7f0e67..36b166a 100644
--- a/integration/src/main/java/org/apache/mahout/utils/regex/AnalyzerTransformer.java
+++ b/integration/src/main/java/org/apache/mahout/utils/regex/AnalyzerTransformer.java
@@ -17,7 +17,9 @@
 package org.apache.mahout.utils.regex;
 
-import com.google.common.io.Closeables;
+import java.io.IOException;
+import java.io.StringReader;
+
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
@@ -27,9 +29,6 @@ import org.apache.mahout.common.lucene.TokenStreamIterator;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import java.io.IOException;
-import java.io.StringReader;
-
 public class AnalyzerTransformer implements RegexTransformer {
 
   private Analyzer analyzer;
@@ -53,9 +52,7 @@ public class AnalyzerTransformer implements RegexTransformer {
   @Override
   public String transformMatch(String match) {
     StringBuilder result = new StringBuilder();
-    TokenStream ts = null;
-    try {
-      ts = analyzer.tokenStream(fieldName, new StringReader(match));
+    try (TokenStream ts = analyzer.tokenStream(fieldName, new StringReader(match))) {
       ts.addAttribute(CharTermAttribute.class);
       ts.reset();
       TokenStreamIterator iter = new TokenStreamIterator(ts);
@@ -65,12 +62,6 @@ public class AnalyzerTransformer implements RegexTransformer {
       ts.end();
     } catch (IOException e) {
       throw new IllegalStateException(e);
-    } finally {
-      try {
-        Closeables.close(ts, true);
-      } catch (IOException e) {
-        log.error(e.getMessage(), e);
-      }
     }
     return result.toString();
   }
http://git-wip-us.apache.org/repos/asf/mahout/blob/85f9ece6/integration/src/main/java/org/apache/mahout/utils/regex/RegexMapper.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/regex/RegexMapper.java b/integration/src/main/java/org/apache/mahout/utils/regex/RegexMapper.java
index a744928..04cacaa 100644
--- a/integration/src/main/java/org/apache/mahout/utils/regex/RegexMapper.java
+++ b/integration/src/main/java/org/apache/mahout/utils/regex/RegexMapper.java
@@ -17,7 +17,11 @@
 package org.apache.mahout.utils.regex;
 
-import com.google.common.collect.Lists;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Pattern;
+
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.io.LongWritable;
 import org.apache.hadoop.io.Text;
@@ -25,10 +29,6 @@ import org.apache.hadoop.mapreduce.Mapper;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.mahout.common.ClassUtils;
 
-import java.io.IOException;
-import java.util.List;
-import java.util.regex.Pattern;
-
 public class RegexMapper extends Mapper<LongWritable, Text, LongWritable, Text> {
 
   public static final String REGEX = "regex";
@@ -45,7 +45,7 @@ public class RegexMapper extends Mapper<LongWritable, Text, LongWritable, Text>
 
   @Override
   protected void setup(Context context) throws IOException, InterruptedException {
-    groupsToKeep = Lists.newArrayList();
+    groupsToKeep = new ArrayList<>();
     Configuration config = context.getConfiguration();
     String regexStr = config.get(REGEX);
     regex = Pattern.compile(regexStr);
@@ -72,7 +72,7 @@ public class RegexMapper extends Mapper<LongWritable, Text, LongWritable, Text>
   @Override
   protected void map(LongWritable key, Text text, Context context) throws IOException, InterruptedException {
     String result = RegexUtils.extract(text.toString(), regex, groupsToKeep, " ", transformer);
-    if (result != null && !result.isEmpty()) {
+    if (!result.isEmpty()) {
      String format = formatter.format(result);
       context.write(key, new Text(format));
     }
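
The AnalyzerTransformer change above works because Lucene's TokenStream is Closeable, so it can sit directly in the try-with-resources header, replacing the nested finally/catch around Closeables.close. For reference, the usual consumption contract (reset, iterate, end, close) looks roughly like this under the Lucene 4.6 API used by this commit; the field name and input text are placeholders:

    import java.io.IOException;
    import java.io.StringReader;
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.util.Version;

    public class TokenStreamExample {
      public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_46);
        try (TokenStream ts = analyzer.tokenStream("body", new StringReader("Hello token stream"))) {
          CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
          ts.reset();                     // must be called before incrementToken()
          while (ts.incrementToken()) {
            System.out.println(term.toString());
          }
          ts.end();                       // consume end-of-stream state
        }                                 // close() happens automatically here
      }
    }

The RegexMapper hunk also drops the null check on RegexUtils.extract's result; that presumes extract returns an empty string rather than null when nothing matches, which the diff itself does not show.
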
http://git-wip-us.apache.org/repos/asf/mahout/blob/85f9ece6/integration/src/main/java/org/apache/mahout/utils/vectors/RowIdJob.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/RowIdJob.java b/integration/src/main/java/org/apache/mahout/utils/vectors/RowIdJob.java
index 0304306..13d61b8 100644
--- a/integration/src/main/java/org/apache/mahout/utils/vectors/RowIdJob.java
+++ b/integration/src/main/java/org/apache/mahout/utils/vectors/RowIdJob.java
@@ -5,9 +5,9 @@
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -20,7 +20,6 @@ package org.apache.mahout.utils.vectors;
 import java.util.List;
 import java.util.Map;
 
-import com.google.common.io.Closeables;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
@@ -58,7 +57,7 @@ public class RowIdJob extends AbstractJob {
     addInputOption();
     addOutputOption();
 
-    Map<String,List<String>> parsedArgs = parseArguments(args);
+    Map<String, List<String>> parsedArgs = parseArguments(args);
     if (parsedArgs == null) {
       return -1;
     }
@@ -69,23 +68,17 @@ public class RowIdJob extends AbstractJob {
     Path outputPath = getOutputPath();
     Path indexPath = new Path(outputPath, "docIndex");
     Path matrixPath = new Path(outputPath, "matrix");
-    SequenceFile.Writer indexWriter = SequenceFile.createWriter(fs,
-        conf,
-        indexPath,
-        IntWritable.class,
-        Text.class);
-    SequenceFile.Writer matrixWriter = SequenceFile.createWriter(fs,
-        conf,
-        matrixPath,
-        IntWritable.class,
-        VectorWritable.class);
-    try {
+
+    try (SequenceFile.Writer indexWriter = SequenceFile.createWriter(fs, conf, indexPath,
+             IntWritable.class, Text.class);
+         SequenceFile.Writer matrixWriter = SequenceFile.createWriter(fs, conf, matrixPath, IntWritable.class,
+             VectorWritable.class)) {
       IntWritable docId = new IntWritable();
       int i = 0;
       int numCols = 0;
-      for (Pair<Text,VectorWritable> record
-          : new SequenceFileDirIterable<Text,VectorWritable>(getInputPath(), PathType.LIST, PathFilters.logsCRCFilter(),
-            null, true, conf)) {
+      for (Pair<Text, VectorWritable> record
+          : new SequenceFileDirIterable<Text, VectorWritable>(getInputPath(), PathType.LIST, PathFilters.logsCRCFilter(),
+          null, true, conf)) {
         VectorWritable value = record.getSecond();
         docId.set(i);
         indexWriter.append(docId, record.getFirst());
@@ -96,9 +89,6 @@ public class RowIdJob extends AbstractJob {
       log.info("Wrote out matrix with {} rows and {} columns to {}", i, numCols, matrixPath);
       return 0;
-    } finally {
-      Closeables.close(indexWriter, false);
-      Closeables.close(matrixWriter, false);
     }
   }
http://git-wip-us.apache.org/repos/asf/mahout/blob/85f9ece6/integration/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java b/integration/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java
index 9214434..93ad0d5 100644
--- a/integration/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java
+++ b/integration/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java
@@ -21,13 +21,13 @@ import java.io.File;
 import java.io.IOException;
 import java.io.OutputStreamWriter;
 import java.io.Writer;
+import java.util.HashSet;
 import java.util.Iterator;
 import java.util.Set;
 
-import com.google.common.base.Charsets;
-import com.google.common.collect.Sets;
 import com.google.common.io.Closeables;
 import com.google.common.io.Files;
+import org.apache.commons.io.Charsets;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
@@ -132,7 +132,7 @@ public final class VectorDumper extends AbstractJob {
 
     Set<String> filters;
     if (hasOption("filter")) {
-      filters = Sets.newHashSet(getOptions("filter"));
+      filters = new HashSet<>(getOptions("filter"));
     } else {
       filters = null;
     }
http://git-wip-us.apache.org/repos/asf/mahout/blob/85f9ece6/integration/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java b/integration/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java
index 29b02aa..66c3fb6 100644
--- a/integration/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java
+++ b/integration/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java
@@ -19,8 +19,6 @@ package org.apache.mahout.utils.vectors;
 
 import com.google.common.base.Function;
 import com.google.common.collect.Collections2;
-import com.google.common.collect.Iterables;
-import com.google.common.collect.Lists;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.IntWritable;
@@ -39,11 +37,12 @@ import java.io.File;
 import java.io.FileInputStream;
 import java.io.IOException;
 import java.io.InputStream;
+import java.util.ArrayList;
 import java.util.Collection;
 import java.util.Collections;
+import java.util.Comparator;
 import java.util.Iterator;
 import java.util.List;
-import java.util.Comparator;
 import java.util.regex.Pattern;
 
 /** Static utility methods related to vectors. */
@@ -82,7 +81,7 @@ public final class VectorHelper {
 
   public static List<Pair<Integer, Double>> topEntries(Vector vector, int maxEntries) {
     // Get the size of nonZero elements in the input vector
-    int sizeOfNonZeroElementsInVector = Iterables.size(vector.nonZeroes());
+    int sizeOfNonZeroElementsInVector = vector.getNumNonZeroElements();
 
     // If the sizeOfNonZeroElementsInVector < maxEntries then set maxEntries = sizeOfNonZeroElementsInVector
     // otherwise the call to queue.pop() returns a Pair(null, null) and the subsequent call
@@ -95,7 +94,7 @@ public final class VectorHelper {
     for (Element e : vector.nonZeroes()) {
       queue.insertWithOverflow(Pair.of(e.index(), e.get()));
     }
-    List<Pair<Integer, Double>> entries = Lists.newArrayList();
+    List<Pair<Integer, Double>> entries = new ArrayList<>();
     Pair<Integer, Double> pair;
     while ((pair = queue.pop()) != null) {
       if (pair.getFirst() > -1) {
@@ -112,7 +111,7 @@ public final class VectorHelper {
   }
 
   public static List<Pair<Integer, Double>> firstEntries(Vector vector, int maxEntries) {
-    List<Pair<Integer, Double>> entries = Lists.newArrayList();
+    List<Pair<Integer, Double>> entries = new ArrayList<>();
     Iterator<Vector.Element> it = vector.nonZeroes().iterator();
     int i = 0;
     while (it.hasNext() && i++ < maxEntries) {
@@ -125,7 +124,7 @@ public final class VectorHelper {
   public static List<Pair<String, Double>> toWeightedTerms(Collection<Pair<Integer, Double>> entries,
       final String[] dictionary) {
     if (dictionary != null) {
-      return Lists.newArrayList(Collections2.transform(entries,
+      return new ArrayList<>(Collections2.transform(entries,
           new Function<Pair<Integer, Double>, Pair<String, Double>>() {
             @Override
             public Pair<String, Double> apply(Pair<Integer, Double> p) {
@@ -133,7 +132,7 @@ public final class VectorHelper {
             }
           }));
     } else {
-      return Lists.newArrayList(Collections2.transform(entries,
+      return new ArrayList<>(Collections2.transform(entries,
          new Function<Pair<Integer, Double>, Pair<String, Double>>() {
             @Override
             public Pair<String, Double> apply(Pair<Integer, Double> p) {
http://git-wip-us.apache.org/repos/asf/mahout/blob/85f9ece6/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFIterator.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFIterator.java b/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFIterator.java
index bf5b58b..f2632a4 100644
--- a/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFIterator.java
+++ b/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFIterator.java
@@ -19,12 +19,12 @@ package org.apache.mahout.utils.vectors.arff;
 
 import java.io.BufferedReader;
 import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
-import java.util.List;
 
 import com.google.common.collect.AbstractIterator;
-import com.google.common.collect.Lists;
 import com.google.common.io.Closeables;
 import org.apache.mahout.math.DenseVector;
 import org.apache.mahout.math.RandomAccessSparseVector;
@@ -103,7 +103,7 @@ final class ARFFIterator extends AbstractIterator<Vector> {
    */
  public static String[] splitCSV(String line) {
     StringBuilder sb = new StringBuilder(128);
-    List<String> tokens = Lists.newArrayList();
+    List<String> tokens = new ArrayList<>();
     char escapeChar = '\0';
     for (int i = 0; i < line.length(); i++) {
       char c = line.charAt(i);
http://git-wip-us.apache.org/repos/asf/mahout/blob/85f9ece6/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFModel.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFModel.java b/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFModel.java
index c005005..fc86997 100644
--- a/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFModel.java
+++ b/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFModel.java
@@ -20,7 +20,6 @@ package org.apache.mahout.utils.vectors.arff;
 
 import java.text.DateFormat;
 import java.util.Map;
 
-
 /**
  * An interface for representing an ARFFModel. Implementations can decide on the best approach
 * for storing the model, as some approaches will be fine for smaller files, while larger
http://git-wip-us.apache.org/repos/asf/mahout/blob/85f9ece6/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterable.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterable.java b/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterable.java
index 72b840f..180a1e1 100644
--- a/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterable.java
+++ b/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterable.java
@@ -17,10 +17,6 @@
 package org.apache.mahout.utils.vectors.arff;
 
-import com.google.common.base.Charsets;
-import com.google.common.io.Files;
-import org.apache.mahout.math.Vector;
-
 import java.io.BufferedReader;
 import java.io.File;
 import java.io.IOException;
@@ -32,6 +28,10 @@ import java.text.SimpleDateFormat;
 import java.util.Iterator;
 import java.util.Locale;
 
+import com.google.common.io.Files;
+import org.apache.commons.io.Charsets;
+import org.apache.mahout.math.Vector;
+
 /**
  * Read in ARFF (http://www.cs.waikato.ac.nz/~ml/weka/arff.html) and create {@link Vector}s
  * <p/>
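
Two details in the VectorHelper hunks above are worth spelling out. First, Iterables.size(vector.nonZeroes()) walks the whole iterable just to count it, while Mahout's Vector.getNumNonZeroElements() (visible in the new line of the diff) asks the vector directly. Second, wrapping Collections2.transform(...) in new ArrayList<>(...) matters because Guava's transform returns a lazy view; copying it materializes the results once, exactly as Lists.newArrayList(...) did. A small hedged sketch of the first point, assuming the Mahout 0.9-era math API:

    import org.apache.mahout.math.RandomAccessSparseVector;
    import org.apache.mahout.math.Vector;

    public class NonZeroCountExample {
      public static void main(String[] args) {
        Vector v = new RandomAccessSparseVector(100);
        v.set(3, 1.5);
        v.set(42, -2.0);
        // Old: Iterables.size(v.nonZeroes()) iterated just to count.
        // New: the vector reports its own non-zero count.
        System.out.println(v.getNumNonZeroElements()); // prints 2
      }
    }
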
http://git-wip-us.apache.org/repos/asf/mahout/blob/85f9ece6/integration/src/main/java/org/apache/mahout/utils/vectors/arff/Driver.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/arff/Driver.java b/integration/src/main/java/org/apache/mahout/utils/vectors/arff/Driver.java
index 63a9f0d..ccecbb1 100644
--- a/integration/src/main/java/org/apache/mahout/utils/vectors/arff/Driver.java
+++ b/integration/src/main/java/org/apache/mahout/utils/vectors/arff/Driver.java
@@ -5,9 +5,9 @@
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -21,17 +21,16 @@
 import java.io.File;
 import java.io.FilenameFilter;
 import java.io.IOException;
 import java.io.Writer;
+import java.util.ArrayList;
 import java.util.Collections;
 import java.util.Comparator;
+import java.util.HashMap;
+import java.util.LinkedList;
 import java.util.List;
 import java.util.Map;
 import java.util.Map.Entry;
 import java.util.Set;
 
-import com.google.common.base.Charsets;
-import com.google.common.collect.Lists;
-import com.google.common.collect.Maps;
-import com.google.common.io.Closeables;
 import com.google.common.io.Files;
 import org.apache.commons.cli2.CommandLine;
 import org.apache.commons.cli2.Group;
@@ -41,6 +40,7 @@ import org.apache.commons.cli2.builder.ArgumentBuilder;
 import org.apache.commons.cli2.builder.DefaultOptionBuilder;
 import org.apache.commons.cli2.builder.GroupBuilder;
 import org.apache.commons.cli2.commandline.Parser;
+import org.apache.commons.io.Charsets;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
@@ -62,42 +62,43 @@ public final class Driver {
   /** used for JSON serialization/deserialization */
   private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
 
-  private Driver() { }
-
+  private Driver() {
+  }
+
   public static void main(String[] args) throws IOException {
     DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
     ArgumentBuilder abuilder = new ArgumentBuilder();
     GroupBuilder gbuilder = new GroupBuilder();
-    
+
     Option inputOpt = obuilder
       .withLongName("input")
       .withRequired(true)
       .withArgument(abuilder.withName("input").withMinimum(1).withMaximum(1).create())
       .withDescription(
-        "The file or directory containing the ARFF files. If it is a directory, all .arff files will be converted")
+          "The file or directory containing the ARFF files. If it is a directory, all .arff files will be converted")
       .withShortName("d").create();
-    
+
     Option outputOpt = obuilder.withLongName("output").withRequired(true).withArgument(
-      abuilder.withName("output").withMinimum(1).withMaximum(1).create()).withDescription(
-      "The output directory. Files will have the same name as the input, but with the extension .mvc")
+        abuilder.withName("output").withMinimum(1).withMaximum(1).create()).withDescription(
+        "The output directory. Files will have the same name as the input, but with the extension .mvc")
       .withShortName("o").create();
-    
+
     Option maxOpt = obuilder.withLongName("max").withRequired(false).withArgument(
-      abuilder.withName("max").withMinimum(1).withMaximum(1).create()).withDescription(
-      "The maximum number of vectors to output. If not specified, then it will loop over all docs")
+        abuilder.withName("max").withMinimum(1).withMaximum(1).create()).withDescription(
+        "The maximum number of vectors to output. If not specified, then it will loop over all docs")
       .withShortName("m").create();
-    
+
     Option dictOutOpt = obuilder.withLongName("dictOut").withRequired(true).withArgument(
-      abuilder.withName("dictOut").withMinimum(1).withMaximum(1).create()).withDescription(
-      "The file to output the label bindings").withShortName("t").create();
-    
+        abuilder.withName("dictOut").withMinimum(1).withMaximum(1).create()).withDescription(
+        "The file to output the label bindings").withShortName("t").create();
+
     Option jsonDictonaryOpt = obuilder.withLongName("json-dictonary").withRequired(false)
-      .withDescription("Write dictonary in JSON format").withShortName("j").create();
-    
+        .withDescription("Write dictonary in JSON format").withShortName("j").create();
+
     Option delimiterOpt = obuilder.withLongName("delimiter").withRequired(false).withArgument(
-      abuilder.withName("delimiter").withMinimum(1).withMaximum(1).create()).withDescription(
-      "The delimiter for outputing the dictionary").withShortName("l").create();
-    
+        abuilder.withName("delimiter").withMinimum(1).withMaximum(1).create()).withDescription(
+        "The delimiter for outputing the dictionary").withShortName("l").create();
+
     Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h")
       .create();
     Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(outputOpt).withOption(maxOpt)
@@ -108,9 +109,9 @@ public final class Driver {
     Parser parser = new Parser();
     parser.setGroup(group);
     CommandLine cmdLine = parser.parse(args);
-    
+
     if (cmdLine.hasOption(helpOpt)) {
-      
+
       CommandLineUtil.printHelp(group);
       return;
     }
@@ -137,7 +138,7 @@ public final class Driver {
             return name.endsWith(".arff");
           }
         });
-        
+
         for (File file : files) {
           writeFile(outDir, file, maxDocs, model, dictOut, delimiter, jsonDictonary);
         }
@@ -145,31 +146,28 @@ public final class Driver {
         writeFile(outDir, input, maxDocs, model, dictOut, delimiter, jsonDictonary);
       }
     }
-    
+
   } catch (OptionException e) {
      log.error("Exception", e);
      CommandLineUtil.printHelp(group);
    }
  }
-  
+
   protected static void writeLabelBindings(File dictOut, ARFFModel arffModel, String delimiter, boolean jsonDictonary)
-    throws IOException {
-    Writer writer = Files.newWriterSupplier(dictOut, Charsets.UTF_8, true).getOutput();
-    try {
+      throws IOException {
+    try (Writer writer = Files.newWriterSupplier(dictOut, Charsets.UTF_8, true).getOutput()) {
       if (jsonDictonary) {
         writeLabelBindingsJSON(writer, arffModel);
       } else {
         writeLabelBindings(writer, arffModel, delimiter);
       }
-    } finally {
-      Closeables.close(writer, false);
     }
   }
 
-  protected static void writeLabelBindingsJSON(Writer writer, ARFFModel arffModel) throws IOException {
+  protected static void writeLabelBindingsJSON(Writer writer, ARFFModel arffModel) throws IOException {
     // Turn the map of labels into a list order by order of appearance
-    List<Entry<String, Integer>> attributes = Lists.newArrayList();
+    List<Entry<String, Integer>> attributes = new ArrayList<>();
     attributes.addAll(arffModel.getLabelBindings().entrySet());
     Collections.sort(attributes, new Comparator<Map.Entry<String, Integer>>() {
       @Override
@@ -177,13 +175,13 @@ public final class Driver {
         return t.getValue().compareTo(t1.getValue());
       }
     });
-    
+
     // write a map for each object
-    List<Map<String, Object>> jsonObjects = Lists.newLinkedList();
-    for (int i = 0; i < attributes.size(); i++) {
-      
+    List<Map<String, Object>> jsonObjects = new LinkedList<>();
+    for (int i = 0; i < attributes.size(); i++) {
+
       Entry<String, Integer> modelRepresentation = attributes.get(i);
-      Map<String, Object> jsonRepresentation = Maps.newHashMap();
+      Map<String, Object> jsonRepresentation = new HashMap<>();
       jsonObjects.add(jsonRepresentation);
       // the last one is the class label
       jsonRepresentation.put("label", i < (attributes.size() - 1) ? String.valueOf(false) : String.valueOf(true));
@@ -232,37 +230,34 @@ public final class Driver {
       }
     }
   }
-  
+
   protected static void writeFile(String outDir,
-    File file,
-    long maxDocs,
-    ARFFModel arffModel,
-    File dictOut,
-    String delimiter,
-    boolean jsonDictonary) throws IOException {
+                                  File file,
+                                  long maxDocs,
+                                  ARFFModel arffModel,
+                                  File dictOut,
+                                  String delimiter,
+                                  boolean jsonDictonary) throws IOException {
     log.info("Converting File: {}", file);
     ARFFModel model = new MapBackedARFFModel(arffModel.getWords(), arffModel.getWordCount() + 1, arffModel
         .getNominalMap());
     Iterable<Vector> iteratable = new ARFFVectorIterable(file, model);
     String outFile = outDir + '/' + file.getName() + ".mvc";
-    
-    VectorWriter vectorWriter = getSeqFileWriter(outFile);
-    try {
+
+    try (VectorWriter vectorWriter = getSeqFileWriter(outFile)) {
       long numDocs = vectorWriter.write(iteratable, maxDocs);
       writeLabelBindings(dictOut, model, delimiter, jsonDictonary);
       log.info("Wrote: {} vectors", numDocs);
-    } finally {
-      Closeables.close(vectorWriter, false);
     }
   }
-  
+
   private static VectorWriter getSeqFileWriter(String outFile) throws IOException {
     Path path = new Path(outFile);
     Configuration conf = new Configuration();
     FileSystem fs = FileSystem.get(conf);
     SequenceFile.Writer seqWriter = SequenceFile.createWriter(fs, conf, path, LongWritable.class,
-      VectorWritable.class);
+        VectorWritable.class);
     return new SequenceFileVectorWriter(seqWriter);
   }
-  
+
 }
http://git-wip-us.apache.org/repos/asf/mahout/blob/85f9ece6/integration/src/main/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModel.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModel.java b/integration/src/main/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModel.java
index a272053..e911b1a 100644
--- a/integration/src/main/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModel.java
+++ b/integration/src/main/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModel.java
@@ -17,8 +17,6 @@
 package org.apache.mahout.utils.vectors.arff;
 
-import com.google.common.collect.Maps;
-
 import java.text.DateFormat;
 import java.text.NumberFormat;
 import java.text.ParseException;
@@ -56,10 +54,10 @@ public class MapBackedARFFModel implements ARFFModel {
   public MapBackedARFFModel(Map<String,Long> words, long wordCount, Map<String,Map<String,Integer>> nominalMap) {
     this.words = words;
     this.wordCount = wordCount;
-    labelBindings = Maps.newHashMap();
-    idxLabel = Maps.newHashMap();
-    typeMap = Maps.newHashMap();
-    dateMap = Maps.newHashMap();
+    labelBindings = new HashMap<>();
+    idxLabel = new HashMap<>();
+    typeMap = new HashMap<>();
+    dateMap = new HashMap<>();
     this.nominalMap = nominalMap;
   }
 
@@ -230,7 +228,7 @@ public class MapBackedARFFModel implements ARFFModel {
   public void addNominal(String label, String nominal, int idx) {
     Map<String,Integer> noms = nominalMap.get(label);
     if (noms == null) {
-      noms = Maps.newHashMap();
+      noms = new HashMap<>();
       nominalMap.put(label, noms);
     }
     noms.put(nominal, idx);
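
The writeLabelBindingsJSON hunk above orders ARFF attributes by their integer index using an explicit Comparator over map entries, the standard pre-Java-8 idiom for sorting a map by value. The same idiom in isolation (the attribute names and indices are illustrative):

    import java.util.ArrayList;
    import java.util.Collections;
    import java.util.Comparator;
    import java.util.HashMap;
    import java.util.List;
    import java.util.Map;

    public class SortByValueExample {
      public static void main(String[] args) {
        Map<String, Integer> labelBindings = new HashMap<>();
        labelBindings.put("petalWidth", 2);
        labelBindings.put("sepalLength", 0);
        labelBindings.put("class", 3);
        labelBindings.put("petalLength", 1);

        // Copy the entries, then sort them by their integer value.
        List<Map.Entry<String, Integer>> attributes = new ArrayList<>(labelBindings.entrySet());
        Collections.sort(attributes, new Comparator<Map.Entry<String, Integer>>() {
          @Override
          public int compare(Map.Entry<String, Integer> a, Map.Entry<String, Integer> b) {
            return a.getValue().compareTo(b.getValue());
          }
        });
        System.out.println(attributes); // [sepalLength=0, petalLength=1, petalWidth=2, class=3]
      }
    }
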
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfo.java b/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfo.java
index ffe7baa..718704a 100644
--- a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfo.java
+++ b/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfo.java
@@ -17,7 +17,11 @@
 package org.apache.mahout.utils.vectors.lucene;
 
-import com.google.common.collect.Maps;
+import java.io.IOException;
+import java.util.Iterator;
+import java.util.LinkedHashMap;
+import java.util.Map;
+
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.MultiFields;
 import org.apache.lucene.index.Terms;
@@ -26,10 +30,6 @@ import org.apache.lucene.util.BytesRef;
 import org.apache.mahout.utils.vectors.TermEntry;
 import org.apache.mahout.utils.vectors.TermInfo;
 
-import java.io.IOException;
-import java.util.Iterator;
-import java.util.Map;
-
 /**
  * Caches TermEntries from a single field. Materializes all values in the TermEnum to memory (much like FieldCache)
@@ -47,7 +47,7 @@ public class CachedTermInfo implements TermInfo {
     int numDocs = reader.numDocs();
     double percent = numDocs * maxDfPercent / 100.0;
     //Should we use a linked hash map so that we know terms are in order?
-    termEntries = Maps.newLinkedHashMap();
+    termEntries = new LinkedHashMap<>();
     int count = 0;
     BytesRef text;
     while ((text = te.next()) != null) {
http://git-wip-us.apache.org/repos/asf/mahout/blob/85f9ece6/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java b/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java
index 06bec60..6ef7fba 100644
--- a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java
+++ b/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java
@@ -23,15 +23,14 @@ import java.io.OutputStreamWriter;
 import java.io.Writer;
 import java.util.Collection;
 import java.util.Collections;
+import java.util.HashSet;
 import java.util.LinkedHashMap;
+import java.util.LinkedList;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
 import java.util.TreeSet;
 
-import com.google.common.base.Charsets;
-import com.google.common.collect.Lists;
-import com.google.common.collect.Sets;
 import com.google.common.io.Closeables;
 import com.google.common.io.Files;
 import org.apache.commons.cli2.CommandLine;
@@ -42,6 +41,7 @@ import org.apache.commons.cli2.builder.ArgumentBuilder;
 import org.apache.commons.cli2.builder.DefaultOptionBuilder;
 import org.apache.commons.cli2.builder.GroupBuilder;
 import org.apache.commons.cli2.commandline.Parser;
+import org.apache.commons.io.Charsets;
 import org.apache.hadoop.fs.Path;
 import org.apache.lucene.index.DirectoryReader;
 import org.apache.lucene.index.DocsEnum;
@@ -107,13 +107,8 @@ public class ClusterLabels {
 
   public void getLabels() throws IOException {
 
-    Writer writer;
-    if (this.output == null) {
-      writer = new OutputStreamWriter(System.out, Charsets.UTF_8);
-    } else {
-      writer = Files.newWriter(new File(this.output), Charsets.UTF_8);
-    }
-    try {
+    try (Writer writer = (this.output == null) ?
+        new OutputStreamWriter(System.out, Charsets.UTF_8) : Files.newWriter(new File(this.output), Charsets.UTF_8)){
       for (Map.Entry<Integer, List<WeightedPropertyVectorWritable>> integerListEntry : clusterIdToPoints.entrySet()) {
         List<WeightedPropertyVectorWritable> wpvws = integerListEntry.getValue();
         List<TermInfoClusterInOut> termInfos = getClusterLabels(integerListEntry.getKey(), wpvws);
@@ -139,8 +134,6 @@ public class ClusterLabels {
           }
         }
       }
-    } finally {
-      Closeables.close(writer, false);
     }
   }
 
@@ -162,7 +155,7 @@ public class ClusterLabels {
 
     log.info("# of documents in the index {}", reader.numDocs());
 
-    Collection<String> idSet = Sets.newHashSet();
+    Collection<String> idSet = new HashSet<>();
     for (WeightedPropertyVectorWritable wpvw : wpvws) {
       Vector vector = wpvw.getVector();
       if (vector instanceof NamedVector) {
@@ -214,7 +207,7 @@ public class ClusterLabels {
     }
 
-    List<TermInfoClusterInOut> clusteredTermInfo = Lists.newLinkedList();
+    List<TermInfoClusterInOut> clusteredTermInfo = new LinkedList<>();
 
     int clusterSize = wpvws.size();
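
ClusterLabels now builds its Writer with a ternary directly inside the try-with-resources header, so the stdout and file cases share one close path. A standalone sketch of the idiom, with a caveat that applies equally to the old Closeables-based code: closing an OutputStreamWriter wrapped around System.out also closes System.out itself.

    import java.io.FileWriter;
    import java.io.IOException;
    import java.io.OutputStreamWriter;
    import java.io.Writer;

    public class ConditionalWriterExample {
      public static void main(String[] args) throws IOException {
        String output = args.length > 0 ? args[0] : null;
        // Either destination is closed automatically when the block exits.
        try (Writer writer = (output == null)
            ? new OutputStreamWriter(System.out, "UTF-8")
            : new FileWriter(output)) {
          writer.write("labels...\n");
          writer.flush();
        }
        // Note: if output == null, System.out is closed at this point.
      }
    }
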
http://git-wip-us.apache.org/repos/asf/mahout/blob/85f9ece6/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java b/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java
index bdc5652..2eeebd9 100644
--- a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java
+++ b/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java
@@ -5,9 +5,9 @@
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -22,9 +22,7 @@
 import java.io.IOException;
 import java.io.Writer;
 import java.util.Iterator;
 
-import com.google.common.base.Charsets;
 import com.google.common.base.Preconditions;
-import com.google.common.io.Closeables;
 import com.google.common.io.Files;
 import org.apache.commons.cli2.CommandLine;
 import org.apache.commons.cli2.Group;
@@ -34,6 +32,7 @@ import org.apache.commons.cli2.builder.ArgumentBuilder;
 import org.apache.commons.cli2.builder.DefaultOptionBuilder;
 import org.apache.commons.cli2.builder.GroupBuilder;
 import org.apache.commons.cli2.commandline.Parser;
+import org.apache.commons.io.Charsets;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
@@ -80,15 +79,15 @@ public final class Driver {
     File file = new File(luceneDir);
 
     Preconditions.checkArgument(file.isDirectory(),
-        "Lucene directory: " + file.getAbsolutePath()
-        + " does not exist or is not a directory");
+        "Lucene directory: " + file.getAbsolutePath() +
+        " does not exist or is not a directory");
     Preconditions.checkArgument(maxDocs >= 0, "maxDocs must be >= 0");
     Preconditions.checkArgument(minDf >= 1, "minDf must be >= 1");
     Preconditions.checkArgument(maxDFPercent <= 99, "maxDFPercent must be <= 99");
 
     Directory dir = FSDirectory.open(file);
     IndexReader reader = DirectoryReader.open(dir);
-    
+
     Weight weight;
     if ("tf".equalsIgnoreCase(weightType)) {
@@ -100,7 +99,7 @@ public final class Driver {
     }
 
     TermInfo termInfo = new CachedTermInfo(reader, field, minDf, maxDFPercent);
-    
+
     LuceneIterable iterable;
     if (norm == LuceneIterable.NO_NORMALIZING) {
       iterable = new LuceneIterable(reader, idField, field, termInfo, weight, LuceneIterable.NO_NORMALIZING,
@@ -111,22 +110,16 @@ public final class Driver {
 
     log.info("Output File: {}", outFile);
 
-    VectorWriter vectorWriter = getSeqFileWriter(outFile);
-    try {
+    try (VectorWriter vectorWriter = getSeqFileWriter(outFile)) {
       long numDocs = vectorWriter.write(iterable, maxDocs);
       log.info("Wrote: {} vectors", numDocs);
-    } finally {
-      Closeables.close(vectorWriter, false);
     }
 
     File dictOutFile = new File(dictOut);
     log.info("Dictionary Output file: {}", dictOutFile);
     Writer writer = Files.newWriter(dictOutFile, Charsets.UTF_8);
-    DelimitedTermInfoWriter tiWriter = new DelimitedTermInfoWriter(writer, delimiter, field);
-    try {
+    try (DelimitedTermInfoWriter tiWriter = new DelimitedTermInfoWriter(writer, delimiter, field)) {
       tiWriter.write(termInfo);
-    } finally {
-      Closeables.close(tiWriter, false);
     }
 
     if (!"".equals(seqDictOut)) {
@@ -135,12 +128,9 @@ public final class Driver {
       Path path = new Path(seqDictOut);
       Configuration conf = new Configuration();
       FileSystem fs = FileSystem.get(conf);
-      SequenceFile.Writer seqWriter = null;
-      try {
-        seqWriter = SequenceFile.createWriter(fs, conf, path, Text.class, IntWritable.class);
+      try (SequenceFile.Writer seqWriter = SequenceFile.createWriter(fs, conf, path, Text.class, IntWritable.class)) {
         Text term = new Text();
         IntWritable termIndex = new IntWritable();
-
         Iterator<TermEntry> termEntries = termInfo.getAllEntries();
         while (termEntries.hasNext()) {
           TermEntry termEntry = termEntries.next();
@@ -148,10 +138,7 @@ public final class Driver {
           termIndex.set(termEntry.getTermIdx());
           seqWriter.append(term, termIndex);
         }
-      } finally {
-        Closeables.close(seqWriter, false);
       }
-
     }
   }
@@ -215,7 +202,7 @@ public final class Driver {
     Option maxPercentErrorDocsOpt = obuilder.withLongName("maxPercentErrorDocs").withRequired(false).withArgument(
         abuilder.withName("maxPercentErrorDocs").withMinimum(1).withMaximum(1).create()).withDescription(
-        "The max percentage of docs that can have a null term vector. These are noise document and can occur if the "
+        "The max percentage of docs that can have a null term vector. These are noise document and can occur if the "
         + "analyzer used strips out all terms in the target field. This percentage is expressed as a value "
         + "between 0 and 1. The default is 0.").withShortName("err").create();
@@ -302,7 +289,7 @@ public final class Driver {
     // TODO: Make this parameter driven
 
     SequenceFile.Writer seqWriter = SequenceFile.createWriter(fs, conf, path, LongWritable.class,
-      VectorWritable.class);
+        VectorWritable.class);
 
     return new SequenceFileVectorWriter(seqWriter);
   }
http://git-wip-us.apache.org/repos/asf/mahout/blob/85f9ece6/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterator.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterator.java b/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterator.java
index 70394ac..6a8c659 100644
--- a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterator.java
+++ b/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterator.java
@@ -17,15 +17,15 @@
 package org.apache.mahout.utils.vectors.lucene;
 
+import java.io.IOException;
+import java.util.Set;
+import java.util.TreeSet;
+
 import com.google.common.base.Preconditions;
-import com.google.common.collect.Sets;
 import org.apache.lucene.index.IndexReader;
 import org.apache.mahout.utils.vectors.TermInfo;
 import org.apache.mahout.vectorizer.Weight;
 
-import java.io.IOException;
-import java.util.Set;
-
 /**
  * An {@link java.util.Iterator} over {@link org.apache.mahout.math.Vector}s that uses a Lucene index as the source
  * for creating the {@link org.apache.mahout.math.Vector}s. The field used to create the vectors currently must have
@@ -77,7 +77,7 @@ public class LuceneIterator extends AbstractLuceneIterator {
                         "Must be: 0.0 <= maxPercentErrorDocs <= 1.0");
     this.idField = idField;
     if (idField != null) {
-      idFieldSelector = Sets.newTreeSet();
+      idFieldSelector = new TreeSet<>();
       idFieldSelector.add(idField);
     } else {
       /*The field in the index containing the index. If null, then the Lucene internal doc id is used
http://git-wip-us.apache.org/repos/asf/mahout/blob/85f9ece6/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java b/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java
index 4bbab65..a1d2bbb 100644
--- a/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java
+++ b/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java
@@ -17,8 +17,11 @@
 package org.apache.mahout.clustering;
 
-import com.google.common.collect.Lists;
-import com.google.common.io.Closeables;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
@@ -52,10 +55,6 @@ import org.apache.mahout.vectorizer.Weight;
 import org.junit.Before;
 import org.junit.Test;
 
-import java.io.IOException;
-import java.util.Iterator;
-import java.util.List;
-
 public final class TestClusterDumper extends MahoutTestCase {
 
   private static final String[] DOCS = {
@@ -92,13 +91,10 @@ public final class TestClusterDumper extends MahoutTestCase {
   }
 
   private void getSampleData(String[] docs2) throws IOException {
-    sampleData = Lists.newArrayList();
+    sampleData = new ArrayList<>();
     RAMDirectory directory = new RAMDirectory();
-
-    IndexWriter writer = new IndexWriter(directory,
-        new IndexWriterConfig(Version.LUCENE_46, new StandardAnalyzer(Version.LUCENE_46)));
-
-    try {
+    try (IndexWriter writer = new IndexWriter(directory,
+        new IndexWriterConfig(Version.LUCENE_46, new StandardAnalyzer(Version.LUCENE_46)))){
       for (int i = 0; i < docs2.length; i++) {
         Document doc = new Document();
         Field id = new StringField("id", "doc_" + i, Field.Store.YES);
@@ -116,13 +112,10 @@ public final class TestClusterDumper extends MahoutTestCase {
         doc.add(text);
         writer.addDocument(doc);
       }
-    } finally {
-      Closeables.close(writer, false);
     }
 
     IndexReader reader = DirectoryReader.open(directory);
-
     Weight weight = new TFIDF();
     TermInfo termInfo = new CachedTermInfo(reader, "content", 1, 100);
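
TestClusterDumper above scopes its IndexWriter in a try-with-resources block, which works because IndexWriter implements Closeable and commits pending documents on close. A hedged sketch of the same in-memory indexing setup under the Lucene 4.6 API this commit targets (document content is illustrative):

    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;
    import org.apache.lucene.document.TextField;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.index.IndexWriterConfig;
    import org.apache.lucene.store.RAMDirectory;
    import org.apache.lucene.util.Version;

    public class InMemoryIndexExample {
      public static void main(String[] args) throws Exception {
        RAMDirectory directory = new RAMDirectory();
        // IndexWriter is Closeable, so try-with-resources closes (and thereby
        // commits) the index even if addDocument throws.
        try (IndexWriter writer = new IndexWriter(directory,
            new IndexWriterConfig(Version.LUCENE_46, new StandardAnalyzer(Version.LUCENE_46)))) {
          Document doc = new Document();
          doc.add(new TextField("content", "The quick red fox jumped over the lazy brown dogs.",
              Field.Store.YES));
          writer.addDocument(doc);
        }
      }
    }
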
org.apache.mahout.clustering.UncommonDistributions; import org.apache.mahout.clustering.evaluation.RepresentativePointsDriver; import org.apache.mahout.clustering.fuzzykmeans.FuzzyKMeansDriver; import org.apache.mahout.clustering.kmeans.KMeansDriver; @@ -46,9 +48,6 @@ import org.junit.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import com.google.common.collect.Lists; -import com.google.common.collect.Maps; - public final class TestCDbwEvaluator extends MahoutTestCase { private static final double[][] REFERENCE = { {1, 1}, {2, 1}, {1, 2}, {2, 2}, {3, 3}, {4, 4}, {5, 4}, {4, 5}, {5, 5}}; @@ -63,9 +62,9 @@ public final class TestCDbwEvaluator extends MahoutTestCase { private FileSystem fs; - private final Collection<VectorWritable> sampleData = Lists.newArrayList(); + private final Collection<VectorWritable> sampleData = new ArrayList<>(); - private List<VectorWritable> referenceData = Lists.newArrayList(); + private List<VectorWritable> referenceData = new ArrayList<>(); private Path testdata; @@ -96,14 +95,14 @@ public final class TestCDbwEvaluator extends MahoutTestCase { * the DistanceMeasure */ private void initData(double dC, double dP, DistanceMeasure measure) { - clusters = Lists.newArrayList(); + clusters = new ArrayList<>(); clusters.add(new Canopy(new DenseVector(new double[] {-dC, -dC}), 1, measure)); clusters.add(new Canopy(new DenseVector(new double[] {-dC, dC}), 3, measure)); clusters.add(new Canopy(new DenseVector(new double[] {dC, dC}), 5, measure)); clusters.add(new Canopy(new DenseVector(new double[] {dC, -dC}), 7, measure)); - representativePoints = Maps.newHashMap(); + representativePoints = new HashMap<>(); for (Cluster cluster : clusters) { - List<VectorWritable> points = Lists.newArrayList(); + List<VectorWritable> points = new ArrayList<>(); representativePoints.put(cluster.getId(), points); points.add(new VectorWritable(cluster.getCenter().clone())); points.add(new VectorWritable(cluster.getCenter().plus(new DenseVector(new double[] {dP, dP})))); @@ -182,7 +181,7 @@ public final class TestCDbwEvaluator extends MahoutTestCase { initData(1, 0.25, measure); Canopy cluster = new Canopy(new DenseVector(new double[] {10, 10}), 19, measure); clusters.add(cluster); - List<VectorWritable> points = Lists.newArrayList(); + List<VectorWritable> points = new ArrayList<>(); representativePoints.put(cluster.getId(), points); CDbwEvaluator evaluator = new CDbwEvaluator(representativePoints, clusters, measure); System.out.println("CDbw = " + evaluator.getCDbw()); @@ -198,7 +197,7 @@ public final class TestCDbwEvaluator extends MahoutTestCase { initData(1, 0.25, measure); Canopy cluster = new Canopy(new DenseVector(new double[] {0, 0}), 19, measure); clusters.add(cluster); - List<VectorWritable> points = Lists.newArrayList(); + List<VectorWritable> points = new ArrayList<>(); points.add(new VectorWritable(cluster.getCenter().plus(new DenseVector(new double[] {1, 1})))); representativePoints.put(cluster.getId(), points); CDbwEvaluator evaluator = new CDbwEvaluator(representativePoints, clusters, measure); @@ -221,7 +220,7 @@ public final class TestCDbwEvaluator extends MahoutTestCase { initData(1, 0.25, measure); Canopy cluster = new Canopy(new DenseVector(new double[] {0, 0}), 19, measure); clusters.add(cluster); - List<VectorWritable> points = Lists.newArrayList(); + List<VectorWritable> points = new ArrayList<>(); points.add(new VectorWritable(cluster.getCenter())); points.add(new VectorWritable(cluster.getCenter())); points.add(new 
VectorWritable(cluster.getCenter())); @@ -246,7 +245,7 @@ public final class TestCDbwEvaluator extends MahoutTestCase { initData(1, 0.25, measure); Canopy cluster = new Canopy(new DenseVector(new double[] {0, 0}), 19, measure); clusters.add(cluster); - List<VectorWritable> points = Lists.newArrayList(); + List<VectorWritable> points = new ArrayList<>(); Vector delta = new DenseVector(new double[] {0, Double.MIN_NORMAL}); points.add(new VectorWritable(delta.clone())); points.add(new VectorWritable(delta.clone())); http://git-wip-us.apache.org/repos/asf/mahout/blob/85f9ece6/integration/src/test/java/org/apache/mahout/text/AbstractLuceneStorageTest.java ---------------------------------------------------------------------- diff --git a/integration/src/test/java/org/apache/mahout/text/AbstractLuceneStorageTest.java b/integration/src/test/java/org/apache/mahout/text/AbstractLuceneStorageTest.java index baa7ac5..c7486c5 100644 --- a/integration/src/test/java/org/apache/mahout/text/AbstractLuceneStorageTest.java +++ b/integration/src/test/java/org/apache/mahout/text/AbstractLuceneStorageTest.java @@ -16,7 +16,11 @@ */ package org.apache.mahout.text; -import com.google.common.collect.Lists; +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.Text; @@ -33,11 +37,6 @@ import org.apache.mahout.text.doc.NumericFieldDocument; import org.apache.mahout.text.doc.SingleFieldDocument; import org.apache.mahout.text.doc.TestDocument; -import java.io.File; -import java.io.IOException; -import java.util.Arrays; -import java.util.List; - /** * Abstract test for working with Lucene storage. */ @@ -45,8 +44,8 @@ public abstract class AbstractLuceneStorageTest extends MahoutTestCase { protected Path indexPath1; protected Path indexPath2; - protected List<TestDocument> docs = Lists.newArrayList(); - protected List<TestDocument> misshapenDocs = Lists.newArrayList(); + protected List<TestDocument> docs = new ArrayList<>(); + protected List<TestDocument> misshapenDocs = new ArrayList<>(); @Override public void setUp() throws Exception { http://git-wip-us.apache.org/repos/asf/mahout/blob/85f9ece6/integration/src/test/java/org/apache/mahout/text/LuceneSegmentRecordReaderTest.java ---------------------------------------------------------------------- diff --git a/integration/src/test/java/org/apache/mahout/text/LuceneSegmentRecordReaderTest.java b/integration/src/test/java/org/apache/mahout/text/LuceneSegmentRecordReaderTest.java index 28f2ac8..c64dbda 100644 --- a/integration/src/test/java/org/apache/mahout/text/LuceneSegmentRecordReaderTest.java +++ b/integration/src/test/java/org/apache/mahout/text/LuceneSegmentRecordReaderTest.java @@ -16,23 +16,24 @@ */ package org.apache.mahout.text; +import java.io.IOException; +import java.lang.reflect.InvocationTargetException; +import java.util.Collections; + import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.mapreduce.TaskAttemptContext; import org.apache.hadoop.mapreduce.TaskAttemptID; -import org.apache.lucene.index.*; +import org.apache.lucene.index.SegmentCommitInfo; +import org.apache.lucene.index.SegmentInfos; import org.apache.mahout.common.HadoopUtil; - import org.junit.After; import org.junit.Before; import org.junit.Test; -import java.io.IOException; -import java.lang.reflect.InvocationTargetException; - -import static 
http://git-wip-us.apache.org/repos/asf/mahout/blob/85f9ece6/integration/src/test/java/org/apache/mahout/text/LuceneStorageConfigurationTest.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/text/LuceneStorageConfigurationTest.java b/integration/src/test/java/org/apache/mahout/text/LuceneStorageConfigurationTest.java
index d1e65c1..f58224c 100644
--- a/integration/src/test/java/org/apache/mahout/text/LuceneStorageConfigurationTest.java
+++ b/integration/src/test/java/org/apache/mahout/text/LuceneStorageConfigurationTest.java
@@ -16,16 +16,14 @@
  */
 package org.apache.mahout.text;
 
+import java.io.IOException;
+import java.util.Collections;
+
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.Path;
 import org.apache.mahout.common.MahoutTestCase;
 import org.junit.Test;
 
-import java.io.IOException;
-
-import static java.util.Arrays.asList;
-import static org.junit.Assert.assertEquals;
-
 public class LuceneStorageConfigurationTest extends MahoutTestCase {
 
   @Test
@@ -34,7 +32,8 @@ public class LuceneStorageConfigurationTest extends MahoutTestCase {
     Path indexPath = new Path("indexPath");
     Path outputPath = new Path("outputPath");
     LuceneStorageConfiguration luceneStorageConfiguration =
-        new LuceneStorageConfiguration(configuration, asList(indexPath), outputPath, "id", asList("field"));
+        new LuceneStorageConfiguration(configuration, Collections.singletonList(indexPath), outputPath,
+            "id", Collections.singletonList("field"));
 
     Configuration serializedConfiguration = luceneStorageConfiguration.serialize();
http://git-wip-us.apache.org/repos/asf/mahout/blob/85f9ece6/integration/src/test/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageDriverTest.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageDriverTest.java b/integration/src/test/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageDriverTest.java
index 7cebc60..03aed0f 100644
--- a/integration/src/test/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageDriverTest.java
+++ b/integration/src/test/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageDriverTest.java
@@ -17,28 +17,23 @@
 package org.apache.mahout.text;
 
-import com.google.common.collect.Iterators;
-import org.apache.commons.lang.StringUtils;
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+
+import org.apache.commons.lang3.StringUtils;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.Writable;
 import org.apache.lucene.search.TermQuery;
 import org.apache.mahout.common.HadoopUtil;
-import org.apache.mahout.common.iterator.sequencefile.PathFilters;
-import org.apache.mahout.common.iterator.sequencefile.PathType;
-import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterator;
 import org.apache.mahout.text.doc.MultipleFieldsDocument;
 import org.apache.mahout.text.doc.SingleFieldDocument;
 import org.junit.After;
 import org.junit.Before;
 import org.junit.Test;
 
-import java.io.IOException;
-import java.util.List;
-
-import static java.util.Arrays.asList;
-
 public class SequenceFilesFromLuceneStorageDriverTest extends AbstractLuceneStorageTest {
 
   private SequenceFilesFromLuceneStorageDriver driver;
@@ -56,7 +51,7 @@ public class SequenceFilesFromLuceneStorageDriverTest extends AbstractLuceneStor
     seqFilesOutputPath = new Path(getTestTempDirPath(), "seqfiles");
     idField = SingleFieldDocument.ID_FIELD;
-    fields = asList("field");
+    fields = Collections.singletonList("field");
 
     driver = new SequenceFilesFromLuceneStorageDriver() {
       @Override
@@ -76,13 +71,13 @@ public class SequenceFilesFromLuceneStorageDriverTest extends AbstractLuceneStor
   @Test
   public void testNewLucene2SeqConfiguration() {
     lucene2SeqConf = driver.newLucene2SeqConfiguration(conf,
-      asList(new Path(getIndexPath1().toString())),
+      Collections.singletonList(new Path(getIndexPath1().toString())),
       seqFilesOutputPath,
       idField,
       fields);
 
     assertEquals(conf, lucene2SeqConf.getConfiguration());
-    assertEquals(asList(getIndexPath1()), lucene2SeqConf.getIndexPaths());
+    assertEquals(Collections.singletonList(getIndexPath1()), lucene2SeqConf.getIndexPaths());
     assertEquals(seqFilesOutputPath, lucene2SeqConf.getSequenceFilesOutputPath());
     assertEquals(idField, lucene2SeqConf.getIdField());
     assertEquals(fields, lucene2SeqConf.getFields());
@@ -90,7 +85,8 @@ public class SequenceFilesFromLuceneStorageDriverTest extends AbstractLuceneStor
   @Test
   public void testRun() throws Exception {
-    List<MultipleFieldsDocument> docs = asList(new MultipleFieldsDocument("123", "test 1", "test 2", "test 3"));
+    List<MultipleFieldsDocument> docs =
+        Collections.singletonList(new MultipleFieldsDocument("123", "test 1", "test 2", "test 3"));
     commitDocuments(getDirectory(getIndexPath1AsFile()), docs.get(0));
 
     String queryField = "queryfield";
@@ -115,7 +111,7 @@ public class SequenceFilesFromLuceneStorageDriverTest extends AbstractLuceneStor
     assertEquals(getIndexPath1().toUri().getPath(), lucene2SeqConf.getIndexPaths().get(0).toUri().getPath());
     assertEquals(seqFilesOutputPath, lucene2SeqConf.getSequenceFilesOutputPath());
     assertEquals(idField, lucene2SeqConf.getIdField());
-    assertEquals(asList(field1, field2), lucene2SeqConf.getFields());
+    assertEquals(Arrays.asList(field1, field2), lucene2SeqConf.getFields());
 
     assertTrue(lucene2SeqConf.getQuery() instanceof TermQuery);
     assertEquals(queryField, ((TermQuery) lucene2SeqConf.getQuery()).getTerm().field());
@@ -167,10 +163,6 @@ public class SequenceFilesFromLuceneStorageDriverTest extends AbstractLuceneStor
     driver.run(args);
     assertTrue(FileSystem.get(conf).exists(seqFilesOutputPath));
     //shouldn't be any real files in the seq files out path
-    SequenceFileDirIterator<Writable, Writable> iter =
-      new SequenceFileDirIterator<Writable, Writable>(seqFilesOutputPath, PathType.LIST, PathFilters.logsCRCFilter(), null, false, conf);
-    assertFalse(Iterators.size(iter) > 0);
-
   }
 
   @Test
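
This file also migrates from the retired commons-lang to commons-lang3: org.apache.commons.lang.StringUtils becomes org.apache.commons.lang3.StringUtils, with the class name itself unchanged. A minimal sketch of what survives the package rename (the join/isBlank helpers shown are standard lang3 API, chosen here purely for illustration):

    // commons-lang3 replaces the unmaintained commons-lang; for StringUtils
    // the migration is usually nothing more than this package rename.
    import org.apache.commons.lang3.StringUtils;

    public class Lang3Migration {
      public static void main(String[] args) {
        // Same helpers as the old org.apache.commons.lang.StringUtils.
        System.out.println(StringUtils.join(new String[] {"field1", "field2"}, ","));
        System.out.println(StringUtils.isBlank("   "));  // true
      }
    }
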
http://git-wip-us.apache.org/repos/asf/mahout/blob/85f9ece6/integration/src/test/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageMRJobTest.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageMRJobTest.java b/integration/src/test/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageMRJobTest.java
index 111bc85..fc03e49 100644
--- a/integration/src/test/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageMRJobTest.java
+++ b/integration/src/test/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageMRJobTest.java
@@ -16,7 +16,12 @@
  */
 package org.apache.mahout.text;
 
-import com.google.common.collect.Maps;
+import java.io.IOException;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.Map;
+
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.Text;
@@ -28,10 +33,6 @@
 import org.junit.After;
 import org.junit.Before;
 import org.junit.Test;
 
-import java.io.IOException;
-import java.util.Iterator;
-import java.util.Map;
-
 import static java.util.Arrays.asList;
 
 public class SequenceFilesFromLuceneStorageMRJobTest extends AbstractLuceneStorageTest {
@@ -45,7 +46,7 @@ public class SequenceFilesFromLuceneStorageMRJobTest extends AbstractLuceneStora
     Configuration configuration = getConfiguration();
     Path seqOutputPath = new Path(getTestTempDirPath(), "seqOutputPath");//don't make the output directory
     lucene2SeqConf = new LuceneStorageConfiguration(configuration, asList(getIndexPath1(), getIndexPath2()),
-      seqOutputPath, SingleFieldDocument.ID_FIELD, asList(SingleFieldDocument.FIELD));
+      seqOutputPath, SingleFieldDocument.ID_FIELD, Collections.singletonList(SingleFieldDocument.FIELD));
   }
 
   @After
@@ -66,7 +67,7 @@ public class SequenceFilesFromLuceneStorageMRJobTest extends AbstractLuceneStora
     lucene2seq.run(lucene2SeqConf);
 
     Iterator<Pair<Text, Text>> iterator = lucene2SeqConf.getSequenceFileIterator();
-    Map<String, Text> map = Maps.newHashMap();
+    Map<String, Text> map = new HashMap<>();
     while (iterator.hasNext()) {
       Pair<Text, Text> next = iterator.next();
       map.put(next.getFirst().toString(), next.getSecond());
http://git-wip-us.apache.org/repos/asf/mahout/blob/85f9ece6/integration/src/test/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageTest.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageTest.java b/integration/src/test/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageTest.java
index ccff1d6..3cd87f2 100644
--- a/integration/src/test/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageTest.java
+++ b/integration/src/test/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageTest.java
@@ -16,7 +16,13 @@
  */
 package org.apache.mahout.text;
 
-import com.google.common.collect.Maps;
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.Map;
+
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.Text;
@@ -37,12 +43,6 @@
 import org.junit.After;
 import org.junit.Before;
 import org.junit.Test;
 
-import java.io.IOException;
-import java.util.Iterator;
-import java.util.Map;
-
-import static java.util.Arrays.asList;
-
 public class SequenceFilesFromLuceneStorageTest extends AbstractLuceneStorageTest {
 
   private SequenceFilesFromLuceneStorage lucene2Seq;
@@ -57,11 +57,8 @@ public class SequenceFilesFromLuceneStorageTest extends AbstractLuceneStorageTes
     lucene2Seq = new SequenceFilesFromLuceneStorage();
     lucene2SeqConf = new LuceneStorageConfiguration(configuration,
-      asList(getIndexPath1(), getIndexPath2()),
-      seqFilesOutputPath,
-      SingleFieldDocument.ID_FIELD,
-      asList(SingleFieldDocument.FIELD));
-
+      Arrays.asList(getIndexPath1(), getIndexPath2()), seqFilesOutputPath,
+      SingleFieldDocument.ID_FIELD, Collections.singletonList(SingleFieldDocument.FIELD));
   }
 
   @After
@@ -83,7 +80,7 @@ public class SequenceFilesFromLuceneStorageTest extends AbstractLuceneStorageTes
     lucene2Seq.run(lucene2SeqConf);
 
     Iterator<Pair<Text, Text>> iterator = lucene2SeqConf.getSequenceFileIterator();
-    Map<String, Text> map = Maps.newHashMap();
+    Map<String, Text> map = new HashMap<>();
     while (iterator.hasNext()) {
       Pair<Text, Text> next = iterator.next();
       map.put(next.getFirst().toString(), next.getSecond());
@@ -106,10 +103,8 @@ public class SequenceFilesFromLuceneStorageTest extends AbstractLuceneStorageTes
     commitDocuments(getDirectory(getIndexPath1AsFile()), new UnstoredFieldsDocument("5", "This is test document 5"));
 
     LuceneStorageConfiguration lucene2SeqConf = new LuceneStorageConfiguration(configuration,
-      asList(getIndexPath1()),
-      seqFilesOutputPath,
-      SingleFieldDocument.ID_FIELD,
-      asList(UnstoredFieldsDocument.FIELD, UnstoredFieldsDocument.UNSTORED_FIELD));
+      Collections.singletonList(getIndexPath1()), seqFilesOutputPath,
+      SingleFieldDocument.ID_FIELD, Arrays.asList(UnstoredFieldsDocument.FIELD, UnstoredFieldsDocument.UNSTORED_FIELD));
 
     lucene2Seq.run(lucene2SeqConf);
   }
@@ -139,10 +134,8 @@ public class SequenceFilesFromLuceneStorageTest extends AbstractLuceneStorageTes
   public void testRunQuery() throws IOException {
     commitDocuments(getDirectory(getIndexPath1AsFile()), docs);
     LuceneStorageConfiguration lucene2SeqConf = new LuceneStorageConfiguration(configuration,
-      asList(getIndexPath1()),
-      seqFilesOutputPath,
-      SingleFieldDocument.ID_FIELD,
-      asList(SingleFieldDocument.FIELD));
+      Collections.singletonList(getIndexPath1()), seqFilesOutputPath,
+      SingleFieldDocument.ID_FIELD, Collections.singletonList(SingleFieldDocument.FIELD));
 
     Query query = new TermQuery(new Term(lucene2SeqConf.getFields().get(0), "599"));
@@ -159,15 +152,18 @@ public class SequenceFilesFromLuceneStorageTest extends AbstractLuceneStorageTes
   @Test
   public void testRunMultipleFields() throws IOException {
     LuceneStorageConfiguration lucene2SeqConf = new LuceneStorageConfiguration(configuration,
-      asList(getIndexPath1()),
-      seqFilesOutputPath,
+      Collections.singletonList(getIndexPath1()), seqFilesOutputPath,
       SingleFieldDocument.ID_FIELD,
-      asList(MultipleFieldsDocument.FIELD, MultipleFieldsDocument.FIELD1, MultipleFieldsDocument.FIELD2));
+      Arrays.asList(MultipleFieldsDocument.FIELD, MultipleFieldsDocument.FIELD1, MultipleFieldsDocument.FIELD2));
 
-    MultipleFieldsDocument multipleFieldsDocument1 = new MultipleFieldsDocument("1", "This is field 1-1", "This is field 1-2", "This is field 1-3");
-    MultipleFieldsDocument multipleFieldsDocument2 = new MultipleFieldsDocument("2", "This is field 2-1", "This is field 2-2", "This is field 2-3");
-    MultipleFieldsDocument multipleFieldsDocument3 = new MultipleFieldsDocument("3", "This is field 3-1", "This is field 3-2", "This is field 3-3");
-    commitDocuments(getDirectory(getIndexPath1AsFile()), multipleFieldsDocument1, multipleFieldsDocument2, multipleFieldsDocument3);
+    MultipleFieldsDocument multipleFieldsDocument1 =
+        new MultipleFieldsDocument("1", "This is field 1-1", "This is field 1-2", "This is field 1-3");
+    MultipleFieldsDocument multipleFieldsDocument2 =
+        new MultipleFieldsDocument("2", "This is field 2-1", "This is field 2-2", "This is field 2-3");
+    MultipleFieldsDocument multipleFieldsDocument3 =
+        new MultipleFieldsDocument("3", "This is field 3-1", "This is field 3-2", "This is field 3-3");
+    commitDocuments(getDirectory(getIndexPath1AsFile()), multipleFieldsDocument1,
+        multipleFieldsDocument2, multipleFieldsDocument3);
 
     lucene2Seq.run(lucene2SeqConf);
@@ -181,10 +177,8 @@ public class SequenceFilesFromLuceneStorageTest extends AbstractLuceneStorageTes
   @Test
   public void testRunNumericField() throws IOException {
     LuceneStorageConfiguration lucene2SeqConf = new LuceneStorageConfiguration(configuration,
-      asList(getIndexPath1()),
-      seqFilesOutputPath,
-      SingleFieldDocument.ID_FIELD,
-      asList(NumericFieldDocument.FIELD, NumericFieldDocument.NUMERIC_FIELD));
+      Collections.singletonList(getIndexPath1()), seqFilesOutputPath,
+      SingleFieldDocument.ID_FIELD, Arrays.asList(NumericFieldDocument.FIELD, NumericFieldDocument.NUMERIC_FIELD));
 
     NumericFieldDocument doc1 = new NumericFieldDocument("1", "This is field 1", 100);
     NumericFieldDocument doc2 = new NumericFieldDocument("2", "This is field 2", 200);
@@ -206,10 +200,10 @@ public class SequenceFilesFromLuceneStorageTest extends AbstractLuceneStorageTes
     commitDocuments(getDirectory(getIndexPath1AsFile()), docs.subList(0, 500));
 
     lucene2SeqConf = new LuceneStorageConfiguration(configuration,
-      asList(getIndexPath1()),
+      Collections.singletonList(getIndexPath1()),
       seqFilesOutputPath,
       "nonExistingField",
-      asList(SingleFieldDocument.FIELD));
+      Collections.singletonList(SingleFieldDocument.FIELD));
 
     lucene2Seq.run(lucene2SeqConf);
   }
@@ -219,10 +213,10 @@ public class SequenceFilesFromLuceneStorageTest extends AbstractLuceneStorageTes
     commitDocuments(getDirectory(getIndexPath1AsFile()), docs.subList(0, 500));
 
     lucene2SeqConf = new LuceneStorageConfiguration(configuration,
-      asList(getIndexPath1()),
+      Collections.singletonList(getIndexPath1()),
      seqFilesOutputPath,
       SingleFieldDocument.ID_FIELD,
-      asList(SingleFieldDocument.FIELD, "nonExistingField"));
+      Arrays.asList(SingleFieldDocument.FIELD, "nonExistingField"));
 
     lucene2Seq.run(lucene2SeqConf);
   }
@@ -240,10 +234,10 @@ public class SequenceFilesFromLuceneStorageTest extends AbstractLuceneStorageTes
     commitDocuments(getDirectory(getIndexPath1AsFile()), document);
 
     lucene2SeqConf = new LuceneStorageConfiguration(configuration,
-      asList(getIndexPath1()),
+      Collections.singletonList(getIndexPath1()),
       seqFilesOutputPath,
       SingleFieldDocument.ID_FIELD,
-      asList(SingleFieldDocument.FIELD, "indexed"));
+      Arrays.asList(SingleFieldDocument.FIELD, "indexed"));
 
     lucene2Seq.run(lucene2SeqConf);
   }
http://git-wip-us.apache.org/repos/asf/mahout/blob/85f9ece6/integration/src/test/java/org/apache/mahout/text/SequenceFilesFromMailArchivesTest.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/text/SequenceFilesFromMailArchivesTest.java b/integration/src/test/java/org/apache/mahout/text/SequenceFilesFromMailArchivesTest.java
index 12c1451..ef2b8a6 100644
--- a/integration/src/test/java/org/apache/mahout/text/SequenceFilesFromMailArchivesTest.java
+++ b/integration/src/test/java/org/apache/mahout/text/SequenceFilesFromMailArchivesTest.java
@@ -20,8 +20,6 @@
 import java.io.File;
 import java.io.FileOutputStream;
 import java.util.zip.GZIPOutputStream;
 
-import com.google.common.io.Closeables;
-
 import org.apache.commons.lang3.SystemUtils;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileStatus;
@@ -56,25 +54,18 @@ public final class SequenceFilesFromMailArchivesTest extends MahoutTestCase {
     File subDir = new File(inputDir, "subdir");
     subDir.mkdir();
     File gzFile = new File(subDir, "mail-messages.gz");
-    GZIPOutputStream gzOut = null;
-    try {
-      gzOut = new GZIPOutputStream(new FileOutputStream(gzFile));
+    try (GZIPOutputStream gzOut = new GZIPOutputStream(new FileOutputStream(gzFile))) {
       gzOut.write(testMailMessages.getBytes("UTF-8"));
       gzOut.finish();
-    } finally {
-      Closeables.close(gzOut, false);
     }
 
     File subDir2 = new File(subDir, "subsubdir");
     subDir2.mkdir();
     File gzFile2 = new File(subDir2, "mail-messages-2.gz");
-    try {
-      gzOut = new GZIPOutputStream(new FileOutputStream(gzFile2));
+    try (GZIPOutputStream gzOut = new GZIPOutputStream(new FileOutputStream(gzFile2))) {
       gzOut.write(testMailMessages.getBytes("UTF-8"));
       gzOut.finish();
-    } finally {
-      Closeables.close(gzOut, false);
-    }
+    }
   }
 
   @Test
@@ -100,7 +91,7 @@ public final class SequenceFilesFromMailArchivesTest extends MahoutTestCase {
     Assert.assertTrue("Expected chunk file " + expectedChunkPath + " not found!", expectedChunkFile.isFile());
 
     Configuration conf = getConfiguration();
-    SequenceFileIterator<Text, Text> iterator = new SequenceFileIterator<Text, Text>(new Path(expectedChunkPath), true, conf);
+    SequenceFileIterator<Text, Text> iterator = new SequenceFileIterator<>(new Path(expectedChunkPath), true, conf);
 
     Assert.assertTrue("First key/value pair not found!", iterator.hasNext());
     Pair<Text, Text> record = iterator.next();
@@ -155,7 +146,7 @@ public final class SequenceFilesFromMailArchivesTest extends MahoutTestCase {
     assertEquals(1, fileStatuses.length); // only one
     assertEquals("part-m-00000", fileStatuses[0].getPath().getName());
     SequenceFileIterator<Text, Text> iterator =
-      new SequenceFileIterator<Text, Text>(mrOutputDir.suffix("/part-m-00000"), true, configuration);
+      new SequenceFileIterator<>(mrOutputDir.suffix("/part-m-00000"), true, configuration);
 
     Assert.assertTrue("First key/value pair not found!", iterator.hasNext());
     Pair<Text, Text> record = iterator.next();
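
The GZIP fixture above shows the try-with-resources conversion applied throughout this commit: a resource declared in the try header is closed automatically, even when the body throws, so the null-guarded finally block around Guava's Closeables.close becomes unnecessary. A minimal standalone sketch (the output file name is illustrative):

    import java.io.FileOutputStream;
    import java.io.IOException;
    import java.nio.charset.StandardCharsets;
    import java.util.zip.GZIPOutputStream;

    public class GzipTryWithResources {
      public static void main(String[] args) throws IOException {
        // Before Java 7 this required a try/finally with a null check
        // (or Guava's Closeables.close); now the stream closes itself.
        try (GZIPOutputStream gzOut =
                 new GZIPOutputStream(new FileOutputStream("messages.gz"))) {
          gzOut.write("sample mail message".getBytes(StandardCharsets.UTF_8));
          gzOut.finish();
        } // gzOut.close() runs implicitly here, even on exception
      }
    }
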
