MAHOUT-1782: Remove code for lucene2seq, this closes apache/mahout#170
Project: http://git-wip-us.apache.org/repos/asf/mahout/repo Commit: http://git-wip-us.apache.org/repos/asf/mahout/commit/82e78a8c Tree: http://git-wip-us.apache.org/repos/asf/mahout/tree/82e78a8c Diff: http://git-wip-us.apache.org/repos/asf/mahout/diff/82e78a8c Branch: refs/heads/master Commit: 82e78a8c9a1beee637ad381baf3ff9d4bb4297cc Parents: 708cc4f Author: smarthi <[email protected]> Authored: Tue Nov 3 20:30:37 2015 -0500 Committer: smarthi <[email protected]> Committed: Tue Nov 3 20:30:37 2015 -0500 ---------------------------------------------------------------------- .../mahout/text/LuceneIndexFileNameFilter.java | 62 ---- .../apache/mahout/text/LuceneIndexHelper.java | 41 --- .../mahout/text/LuceneSegmentInputFormat.java | 80 ----- .../mahout/text/LuceneSegmentInputSplit.java | 107 ------ .../mahout/text/LuceneSegmentRecordReader.java | 103 ------ .../apache/mahout/text/LuceneSeqFileHelper.java | 54 --- .../mahout/text/LuceneStorageConfiguration.java | 333 ----------------- .../text/ReadOnlyFileSystemDirectory.java | 355 ------------------- .../text/SequenceFilesFromLuceneStorage.java | 139 -------- .../SequenceFilesFromLuceneStorageDriver.java | 140 -------- .../SequenceFilesFromLuceneStorageMRJob.java | 66 ---- .../SequenceFilesFromLuceneStorageMapper.java | 83 ----- .../mahout/text/AbstractLuceneStorageTest.java | 107 ------ .../text/LuceneSegmentInputFormatTest.java | 85 ----- .../text/LuceneSegmentInputSplitTest.java | 88 ----- .../text/LuceneSegmentRecordReaderTest.java | 121 ------- .../text/LuceneStorageConfigurationTest.java | 49 --- ...equenceFilesFromLuceneStorageDriverTest.java | 174 --------- ...SequenceFilesFromLuceneStorageMRJobTest.java | 87 ----- .../SequenceFilesFromLuceneStorageTest.java | 244 ------------- src/conf/driver.classes.default.props | 1 - 21 files changed, 2519 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/mahout/blob/82e78a8c/integration/src/main/java/org/apache/mahout/text/LuceneIndexFileNameFilter.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/text/LuceneIndexFileNameFilter.java b/integration/src/main/java/org/apache/mahout/text/LuceneIndexFileNameFilter.java deleted file mode 100644 index c505fcb..0000000 --- a/integration/src/main/java/org/apache/mahout/text/LuceneIndexFileNameFilter.java +++ /dev/null @@ -1,62 +0,0 @@ -package org.apache.mahout.text; -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.fs.PathFilter; -import org.apache.lucene.index.IndexFileNames; - -import java.util.regex.Pattern; - -/** - * A wrapper class to convert an IndexFileNameFilter which implements - * java.io.FilenameFilter to an org.apache.hadoop.fs.PathFilter. - */ -@Deprecated -final class LuceneIndexFileNameFilter implements PathFilter { - - private static final LuceneIndexFileNameFilter LUCENE_INDEX_FILE_NAME_FILTER = new LuceneIndexFileNameFilter(); - - /** - * Get a static instance. - * - * @return the static instance - */ - public static LuceneIndexFileNameFilter getFilter() { - return LUCENE_INDEX_FILE_NAME_FILTER; - } - - private LuceneIndexFileNameFilter() {} - - //TODO: Lucene defines this in IndexFileNames, but it is package private, - // so make sure it doesn't change w/ new releases. - private static final Pattern CODEC_FILE_PATTERN = Pattern.compile("_[a-z0-9]+(_.*)?\\..*"); - - public boolean accept(Path path) { - String name = path.getName(); - if (CODEC_FILE_PATTERN.matcher(name).matches() || name.startsWith(IndexFileNames.SEGMENTS)) { - return true; - } - for (String extension : IndexFileNames.INDEX_EXTENSIONS) { - if (name.endsWith(extension)) { - return true; - } - } - return false; - } - -} http://git-wip-us.apache.org/repos/asf/mahout/blob/82e78a8c/integration/src/main/java/org/apache/mahout/text/LuceneIndexHelper.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/text/LuceneIndexHelper.java b/integration/src/main/java/org/apache/mahout/text/LuceneIndexHelper.java deleted file mode 100644 index 465e51b..0000000 --- a/integration/src/main/java/org/apache/mahout/text/LuceneIndexHelper.java +++ /dev/null @@ -1,41 +0,0 @@ -package org.apache.mahout.text; -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.IndexableField; - -import java.io.IOException; - -/** - * Utility for checking if a field is stored in a Lucene index. - */ -public class LuceneIndexHelper { - - private LuceneIndexHelper() { - - } - - public static void fieldShouldExistInIndex(IndexReader reader, String fieldName) throws IOException { - IndexableField field = reader.document(0).getField(fieldName); - if (field == null || !field.fieldType().stored()) { - throw new IllegalArgumentException("Field '" + fieldName + - "' is possibly not stored since first document in index does not contain this field."); - } - } - -} http://git-wip-us.apache.org/repos/asf/mahout/blob/82e78a8c/integration/src/main/java/org/apache/mahout/text/LuceneSegmentInputFormat.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/text/LuceneSegmentInputFormat.java b/integration/src/main/java/org/apache/mahout/text/LuceneSegmentInputFormat.java deleted file mode 100644 index 1b5d717..0000000 --- a/integration/src/main/java/org/apache/mahout/text/LuceneSegmentInputFormat.java +++ /dev/null @@ -1,80 +0,0 @@ -package org.apache.mahout.text; -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.IOException; -import java.util.ArrayList; -import java.util.List; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.NullWritable; -import org.apache.hadoop.io.Text; -import org.apache.hadoop.mapreduce.InputFormat; -import org.apache.hadoop.mapreduce.InputSplit; -import org.apache.hadoop.mapreduce.JobContext; -import org.apache.hadoop.mapreduce.RecordReader; -import org.apache.hadoop.mapreduce.TaskAttemptContext; -import org.apache.lucene.index.SegmentCommitInfo; -import org.apache.lucene.index.SegmentInfos; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * {@link InputFormat} implementation which splits a Lucene index at the segment level. - */ -@Deprecated -public class LuceneSegmentInputFormat extends InputFormat { - - private static final Logger LOG = LoggerFactory.getLogger(LuceneSegmentInputFormat.class); - - @Override - public List<LuceneSegmentInputSplit> getSplits(JobContext context) throws IOException, InterruptedException { - Configuration configuration = context.getConfiguration(); - - LuceneStorageConfiguration lucene2SeqConfiguration = new LuceneStorageConfiguration(configuration); - - List<LuceneSegmentInputSplit> inputSplits = new ArrayList<>(); - - List<Path> indexPaths = lucene2SeqConfiguration.getIndexPaths(); - for (Path indexPath : indexPaths) { - ReadOnlyFileSystemDirectory directory = new ReadOnlyFileSystemDirectory(FileSystem.get(configuration), indexPath, - false, configuration); - SegmentInfos segmentInfos = new SegmentInfos(); - segmentInfos.read(directory); - - for (SegmentCommitInfo segmentInfo : segmentInfos) { - LuceneSegmentInputSplit inputSplit = new LuceneSegmentInputSplit(indexPath, segmentInfo.info.name, - segmentInfo.sizeInBytes()); - inputSplits.add(inputSplit); - LOG.info("Created {} byte input split for index '{}' segment {}", segmentInfo.sizeInBytes(), indexPath.toUri(), - segmentInfo.info.name); - } - } - - return inputSplits; - } - - @Override - public RecordReader<Text, NullWritable> createRecordReader(InputSplit inputSplit, TaskAttemptContext context) - throws IOException, InterruptedException { - LuceneSegmentRecordReader luceneSegmentRecordReader = new LuceneSegmentRecordReader(); - luceneSegmentRecordReader.initialize(inputSplit, context); - return luceneSegmentRecordReader; - } -} http://git-wip-us.apache.org/repos/asf/mahout/blob/82e78a8c/integration/src/main/java/org/apache/mahout/text/LuceneSegmentInputSplit.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/text/LuceneSegmentInputSplit.java b/integration/src/main/java/org/apache/mahout/text/LuceneSegmentInputSplit.java deleted file mode 100644 index 12949f5..0000000 --- a/integration/src/main/java/org/apache/mahout/text/LuceneSegmentInputSplit.java +++ /dev/null @@ -1,107 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - *3 - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.mahout.text; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.Writable; -import org.apache.hadoop.mapreduce.InputSplit; -import org.apache.lucene.index.SegmentCommitInfo; -import org.apache.lucene.index.SegmentInfo; -import org.apache.lucene.index.SegmentInfos; - -import java.io.DataInput; -import java.io.DataOutput; -import java.io.IOException; - -/** - * {@link InputSplit} implementation that represents a Lucene segment. - */ -@Deprecated -public class LuceneSegmentInputSplit extends InputSplit implements Writable { - - private Path indexPath; - private String segmentInfoName; - private long length; - - public LuceneSegmentInputSplit() { - // For deserialization - } - - public LuceneSegmentInputSplit(Path indexPath, String segmentInfoName, long length) { - this.indexPath = indexPath; - this.segmentInfoName = segmentInfoName; - this.length = length; - } - - @Override - public long getLength() throws IOException, InterruptedException { - return length; - } - - @Override - public String[] getLocations() throws IOException, InterruptedException { - return new String[]{}; - } - - public String getSegmentInfoName() { - return segmentInfoName; - } - - public Path getIndexPath() { - return indexPath; - } - - @Override - public void write(DataOutput out) throws IOException { - out.writeUTF(indexPath.toString()); - out.writeUTF(segmentInfoName); - out.writeLong(length); - } - - @Override - public void readFields(DataInput in) throws IOException { - this.indexPath = new Path(in.readUTF()); - this.segmentInfoName = in.readUTF(); - this.length = in.readLong(); - } - - /** - * Get the {@link SegmentInfo} of this {@link InputSplit} via the given {@link Configuration} - * - * @param configuration the configuration used to locate the index - * @return the segment info or throws exception if not found - * @throws IOException if an error occurs when accessing the directory - */ - public SegmentCommitInfo getSegment(Configuration configuration) throws IOException { - ReadOnlyFileSystemDirectory directory = new ReadOnlyFileSystemDirectory(FileSystem.get(configuration), indexPath, - false, configuration); - - SegmentInfos segmentInfos = new SegmentInfos(); - segmentInfos.read(directory); - - for (SegmentCommitInfo segmentInfo : segmentInfos) { - if (segmentInfo.info.name.equals(segmentInfoName)) { - return segmentInfo; - } - } - - throw new IllegalArgumentException("No such segment: '" + segmentInfoName - + "' in directory " + directory.toString()); - } -} http://git-wip-us.apache.org/repos/asf/mahout/blob/82e78a8c/integration/src/main/java/org/apache/mahout/text/LuceneSegmentRecordReader.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/text/LuceneSegmentRecordReader.java b/integration/src/main/java/org/apache/mahout/text/LuceneSegmentRecordReader.java deleted file mode 100644 index 66d37f7..0000000 --- a/integration/src/main/java/org/apache/mahout/text/LuceneSegmentRecordReader.java +++ /dev/null @@ -1,103 +0,0 @@ -package org.apache.mahout.text; -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.io.NullWritable; -import org.apache.hadoop.io.Text; -import org.apache.hadoop.mapreduce.InputSplit; -import org.apache.hadoop.mapreduce.RecordReader; -import org.apache.hadoop.mapreduce.TaskAttemptContext; -import org.apache.lucene.index.SegmentCommitInfo; -import org.apache.lucene.index.SegmentReader; -import org.apache.lucene.search.IndexSearcher; -import org.apache.lucene.search.Scorer; -import org.apache.lucene.search.Weight; -import org.apache.lucene.store.IOContext; - -import java.io.IOException; - -/** - * {@link RecordReader} implementation for Lucene segments. Each {@link InputSplit} contains a separate Lucene segment. - * Emits records consisting of a {@link Text} document ID and a null key. - */ -@Deprecated -public class LuceneSegmentRecordReader extends RecordReader<Text, NullWritable> { - - public static final int USE_TERM_INFO = 1; - - private SegmentReader segmentReader; - private Scorer scorer; - - private int nextDocId; - private Text key = new Text(); - - @Override - public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException { - LuceneSegmentInputSplit inputSplit = (LuceneSegmentInputSplit) split; - - Configuration configuration = context.getConfiguration(); - LuceneStorageConfiguration lucene2SeqConfiguration = new LuceneStorageConfiguration(configuration); - - SegmentCommitInfo segmentInfo = inputSplit.getSegment(configuration); - segmentReader = new SegmentReader(segmentInfo, USE_TERM_INFO, IOContext.READ); - - - IndexSearcher searcher = new IndexSearcher(segmentReader); - String idField = lucene2SeqConfiguration.getIdField(); - LuceneIndexHelper.fieldShouldExistInIndex(segmentReader, idField); - for (String field : lucene2SeqConfiguration.getFields()) { - LuceneIndexHelper.fieldShouldExistInIndex(segmentReader, field); - } - - Weight weight = lucene2SeqConfiguration.getQuery().createWeight(searcher); - scorer = weight.scorer(segmentReader.getContext(), false, false, null); - if (scorer == null) { - throw new IllegalArgumentException("Could not create query scorer for query: " - + lucene2SeqConfiguration.getQuery()); - } - } - - @Override - public boolean nextKeyValue() throws IOException, InterruptedException { - nextDocId = scorer.nextDoc(); - - return nextDocId != Scorer.NO_MORE_DOCS; - } - - @Override - public Text getCurrentKey() throws IOException, InterruptedException { - key.set(String.valueOf(nextDocId)); - return key; - } - - @Override - public NullWritable getCurrentValue() throws IOException, InterruptedException { - return NullWritable.get(); - } - - @Override - public float getProgress() throws IOException, InterruptedException { - //this is a rough estimate, due to the possible inaccuracies of cost - return scorer.cost() == 0 ? 0 : (float) nextDocId / scorer.cost(); - } - - @Override - public void close() throws IOException { - segmentReader.close(); - } -} http://git-wip-us.apache.org/repos/asf/mahout/blob/82e78a8c/integration/src/main/java/org/apache/mahout/text/LuceneSeqFileHelper.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/text/LuceneSeqFileHelper.java b/integration/src/main/java/org/apache/mahout/text/LuceneSeqFileHelper.java deleted file mode 100644 index e6dc84a..0000000 --- a/integration/src/main/java/org/apache/mahout/text/LuceneSeqFileHelper.java +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.mahout.text; - -import com.google.common.base.Strings; -import org.apache.hadoop.io.Text; -import org.apache.lucene.document.Document; - -import java.util.List; - -import static org.apache.commons.lang.StringUtils.isNotBlank; - -/** - * - * - **/ -@Deprecated -class LuceneSeqFileHelper { - - public static final String SEPARATOR_FIELDS = " "; - public static final int USE_TERM_INFOS = 1; - - private LuceneSeqFileHelper() {} - - public static void populateValues(Document document, Text theValue, List<String> fields) { - - StringBuilder valueBuilder = new StringBuilder(); - for (int i = 0; i < fields.size(); i++) { - String field = fields.get(i); - String fieldValue = document.get(field); - if (isNotBlank(fieldValue)) { - valueBuilder.append(fieldValue); - if (i != fields.size() - 1) { - valueBuilder.append(SEPARATOR_FIELDS); - } - } - } - theValue.set(Strings.nullToEmpty(valueBuilder.toString())); - } -} http://git-wip-us.apache.org/repos/asf/mahout/blob/82e78a8c/integration/src/main/java/org/apache/mahout/text/LuceneStorageConfiguration.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/text/LuceneStorageConfiguration.java b/integration/src/main/java/org/apache/mahout/text/LuceneStorageConfiguration.java deleted file mode 100644 index 735fb5d..0000000 --- a/integration/src/main/java/org/apache/mahout/text/LuceneStorageConfiguration.java +++ /dev/null @@ -1,333 +0,0 @@ -package org.apache.mahout.text; -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.DataInput; -import java.io.DataOutput; -import java.io.IOException; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.HashSet; -import java.util.Iterator; -import java.util.List; -import java.util.Set; - -import com.google.common.base.Preconditions; -import org.apache.commons.lang.StringUtils; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.DefaultStringifier; -import org.apache.hadoop.io.Text; -import org.apache.hadoop.io.Writable; -import org.apache.lucene.analysis.standard.StandardAnalyzer; -import org.apache.lucene.document.DocumentStoredFieldVisitor; -import org.apache.lucene.queryparser.classic.ParseException; -import org.apache.lucene.queryparser.classic.QueryParser; -import org.apache.lucene.search.MatchAllDocsQuery; -import org.apache.lucene.search.Query; -import org.apache.mahout.common.Pair; -import org.apache.mahout.common.iterator.sequencefile.PathFilters; -import org.apache.mahout.common.iterator.sequencefile.PathType; -import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable; - -import static org.apache.lucene.util.Version.LUCENE_46; - -/** - * Holds all the configuration for {@link SequenceFilesFromLuceneStorage}, which generates a sequence file - * with id as the key and a content field as value. - */ -@Deprecated -public class LuceneStorageConfiguration implements Writable { - - private static final Query DEFAULT_QUERY = new MatchAllDocsQuery(); - private static final int DEFAULT_MAX_HITS = Integer.MAX_VALUE; - - static final String KEY = "org.apache.mahout.text.LuceneIndexToSequenceFiles"; - - static final String SEPARATOR_FIELDS = ","; - static final String SEPARATOR_PATHS = ","; - - private Configuration configuration; - private List<Path> indexPaths; - private Path sequenceFilesOutputPath; - private String idField; - private List<String> fields; - private Query query; - private int maxHits; - - /** - * Create a configuration bean with all mandatory parameters. - * - * @param configuration Hadoop configuration for writing sequencefiles - * @param indexPaths paths to the index - * @param sequenceFilesOutputPath path to output the sequence file - * @param idField field used for the key of the sequence file - * @param fields field(s) used for the value of the sequence file - */ - public LuceneStorageConfiguration(Configuration configuration, List<Path> indexPaths, Path sequenceFilesOutputPath, - String idField, List<String> fields) { - Preconditions.checkArgument(configuration != null, "Parameter 'configuration' cannot be null"); - Preconditions.checkArgument(indexPaths != null, "Parameter 'indexPaths' cannot be null"); - Preconditions.checkArgument(indexPaths != null && !indexPaths.isEmpty(), "Parameter 'indexPaths' cannot be empty"); - Preconditions.checkArgument(sequenceFilesOutputPath != null, "Parameter 'sequenceFilesOutputPath' cannot be null"); - Preconditions.checkArgument(idField != null, "Parameter 'idField' cannot be null"); - Preconditions.checkArgument(fields != null, "Parameter 'fields' cannot be null"); - Preconditions.checkArgument(fields != null && !fields.isEmpty(), "Parameter 'fields' cannot be empty"); - - this.configuration = configuration; - this.indexPaths = indexPaths; - this.sequenceFilesOutputPath = sequenceFilesOutputPath; - this.idField = idField; - this.fields = fields; - - this.query = DEFAULT_QUERY; - this.maxHits = DEFAULT_MAX_HITS; - } - - public LuceneStorageConfiguration() { - // Used during serialization. Do not use. - } - - /** - * Deserializes a {@link LuceneStorageConfiguration} from a {@link Configuration}. - * - * @param conf the {@link Configuration} object with a serialized {@link LuceneStorageConfiguration} - * @throws IOException if deserialization fails - */ - public LuceneStorageConfiguration(Configuration conf) throws IOException { - Preconditions.checkNotNull(conf, "Parameter 'configuration' cannot be null"); - - String serializedConfigString = conf.get(KEY); - - if (serializedConfigString == null) { - throw new IllegalArgumentException("Parameter 'configuration' does not contain a serialized " + this.getClass()); - } - - LuceneStorageConfiguration luceneStorageConf = DefaultStringifier.load(conf, KEY, LuceneStorageConfiguration.class); - - this.configuration = conf; - this.indexPaths = luceneStorageConf.getIndexPaths(); - this.sequenceFilesOutputPath = luceneStorageConf.getSequenceFilesOutputPath(); - this.idField = luceneStorageConf.getIdField(); - this.fields = luceneStorageConf.getFields(); - this.query = luceneStorageConf.getQuery(); - this.maxHits = luceneStorageConf.getMaxHits(); - } - - /** - * Serializes this object in a Hadoop {@link Configuration} - * - * @return a {@link Configuration} object with a String serialization - * @throws IOException if serialization fails - */ - public Configuration serialize() throws IOException { - DefaultStringifier.store(configuration, this, KEY); - - return new Configuration(configuration); - } - - /** - * Returns an {@link Iterator} which returns (Text, Text) {@link Pair}s of the produced sequence files. - * - * @return iterator - */ - public Iterator<Pair<Text, Text>> getSequenceFileIterator() { - return new SequenceFileDirIterable<Text, Text>(sequenceFilesOutputPath, PathType.LIST, PathFilters.logsCRCFilter(), - configuration).iterator(); - } - - public Configuration getConfiguration() { - return configuration; - } - - public Path getSequenceFilesOutputPath() { - return sequenceFilesOutputPath; - } - - public List<Path> getIndexPaths() { - return indexPaths; - } - - public String getIdField() { - return idField; - } - - public List<String> getFields() { - return fields; - } - - public void setQuery(Query query) { - this.query = query; - } - - public Query getQuery() { - return query; - } - - public void setMaxHits(int maxHits) { - this.maxHits = maxHits; - } - - public int getMaxHits() { - return maxHits; - } - - public DocumentStoredFieldVisitor getStoredFieldVisitor() { - Set<String> fieldSet = new HashSet<>(Collections.singleton(idField)); - fieldSet.addAll(fields); - return new DocumentStoredFieldVisitor(fieldSet); - } - - @Override - public void write(DataOutput out) throws IOException { - out.writeUTF(sequenceFilesOutputPath.toString()); - out.writeUTF(StringUtils.join(indexPaths, SEPARATOR_PATHS)); - out.writeUTF(idField); - out.writeUTF(StringUtils.join(fields, SEPARATOR_FIELDS)); - out.writeUTF(query.toString()); - out.writeInt(maxHits); - } - - @Override - public void readFields(DataInput in) throws IOException { - try { - sequenceFilesOutputPath = new Path(in.readUTF()); - indexPaths = new ArrayList<>(); - String[] indexPaths = in.readUTF().split(SEPARATOR_PATHS); - for (String indexPath : indexPaths) { - this.indexPaths.add(new Path(indexPath)); - } - idField = in.readUTF(); - fields = Arrays.asList(in.readUTF().split(SEPARATOR_FIELDS)); - query = new QueryParser(LUCENE_46, "query", new StandardAnalyzer(LUCENE_46)).parse(in.readUTF()); - maxHits = in.readInt(); - } catch (ParseException e) { - throw new RuntimeException("Could not deserialize " + this.getClass().getName(), e); - } - } - - @Override - public boolean equals(Object o) { - if (this == o) { - return true; - } - if (o == null || getClass() != o.getClass()) { - return false; - } - - LuceneStorageConfiguration that = (LuceneStorageConfiguration) o; - - if (maxHits != that.maxHits) { - return false; - } - if (fields != null ? !fields.equals(that.fields) : that.fields != null) { - return false; - } - if (idField != null) { - if (!idField.equals(that.idField)) { - return false; - } else { - if (indexPaths != null) { - if (query != null) { - if (sequenceFilesOutputPath != null) { - return indexPaths.equals(that.indexPaths) && sequenceFilesOutputPath.equals(that.sequenceFilesOutputPath) && query.equals(that.query); - } else { - return indexPaths.equals(that.indexPaths) && that.sequenceFilesOutputPath == null && query.equals(that.query); - } - } else { - // query == null - if (that.query == null && indexPaths.equals(that.indexPaths)) { - if (sequenceFilesOutputPath != null) { - return sequenceFilesOutputPath.equals(that.sequenceFilesOutputPath); - } else { - return that.sequenceFilesOutputPath == null; - } - } else { - return false; - } - } - } else { - // indexPaths == null - if (that.indexPaths == null) { - if (query != null) { - if (sequenceFilesOutputPath != null) { - return sequenceFilesOutputPath.equals(that.sequenceFilesOutputPath) && query.equals(that.query); - } else { - return that.sequenceFilesOutputPath == null && query.equals(that.query); - } - } else { - if (that.query == null) { - if (sequenceFilesOutputPath != null) { - return sequenceFilesOutputPath.equals(that.sequenceFilesOutputPath); - } else { - return that.sequenceFilesOutputPath == null; - } - } else { - return false; - } - } - } else { - return false; - } - } - } - } else { - if (that.idField != null) { - return false; - } else { - if (indexPaths != null) { - if (query != null) { - if (sequenceFilesOutputPath != null) { - return !!indexPaths.equals(that.indexPaths) && !!query.equals(that.query) && !!sequenceFilesOutputPath.equals(that.sequenceFilesOutputPath); - } else { - return !!indexPaths.equals(that.indexPaths) && !!query.equals(that.query) && !(that.sequenceFilesOutputPath != null); - } - } else { - if (sequenceFilesOutputPath != null) { - return !!indexPaths.equals(that.indexPaths) && !(that.query != null) && !!sequenceFilesOutputPath.equals(that.sequenceFilesOutputPath); - } else { - return !!indexPaths.equals(that.indexPaths) && !(that.query != null) && !(that.sequenceFilesOutputPath != null); - } - } - } else { - if (query != null) { - if (sequenceFilesOutputPath != null) { - return that.indexPaths == null && query.equals(that.query) && sequenceFilesOutputPath.equals(that.sequenceFilesOutputPath); - } else { - return that.indexPaths == null && query.equals(that.query) && that.sequenceFilesOutputPath == null; - } - } else { - return that.indexPaths == null && that.query == null && (sequenceFilesOutputPath != null ? sequenceFilesOutputPath.equals(that.sequenceFilesOutputPath) : that.sequenceFilesOutputPath == null); - } - } - } - } - - } - - @Override - public int hashCode() { - int result = indexPaths != null ? indexPaths.hashCode() : 0; - result = 31 * result + (sequenceFilesOutputPath != null ? sequenceFilesOutputPath.hashCode() : 0); - result = 31 * result + (idField != null ? idField.hashCode() : 0); - result = 31 * result + (fields != null ? fields.hashCode() : 0); - result = 31 * result + (query != null ? query.hashCode() : 0); - result = 31 * result + maxHits; - return result; - } -} http://git-wip-us.apache.org/repos/asf/mahout/blob/82e78a8c/integration/src/main/java/org/apache/mahout/text/ReadOnlyFileSystemDirectory.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/text/ReadOnlyFileSystemDirectory.java b/integration/src/main/java/org/apache/mahout/text/ReadOnlyFileSystemDirectory.java deleted file mode 100644 index cd8137f..0000000 --- a/integration/src/main/java/org/apache/mahout/text/ReadOnlyFileSystemDirectory.java +++ /dev/null @@ -1,355 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.text; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FSDataInputStream; -import org.apache.hadoop.fs.FSDataOutputStream; -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.lucene.store.BaseDirectory; -import org.apache.lucene.store.BufferedIndexInput; -import org.apache.lucene.store.BufferedIndexOutput; -import org.apache.lucene.store.IOContext; -import org.apache.lucene.store.IndexInput; -import org.apache.lucene.store.IndexOutput; -import org.apache.lucene.store.Lock; -import org.apache.lucene.store.LockFactory; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.util.Collection; - -//TODO: is there a better way of doing this in Lucene 4.x? - -/** - * This class implements a read-only Lucene Directory on top of a general FileSystem. - * Currently it does not support locking. - * <p/> - * // TODO: Rename to FileSystemReadOnlyDirectory - */ -@Deprecated -public class ReadOnlyFileSystemDirectory extends BaseDirectory { - - private final FileSystem fs; - private final Path directory; - private final int ioFileBufferSize; - - private static final Logger log = LoggerFactory.getLogger(ReadOnlyFileSystemDirectory.class); - - /** - * Constructor - * - * @param fs - filesystem - * @param directory - directory path - * @param create - if true create the directory - * @param conf - MR Job Configuration - * @throws IOException - */ - - public ReadOnlyFileSystemDirectory(FileSystem fs, Path directory, boolean create, - Configuration conf) throws IOException { - - this.fs = fs; - this.directory = directory; - this.ioFileBufferSize = conf.getInt("io.file.buffer.size", 4096); - - if (create) { - create(); - } - - boolean isDir = false; - try { - FileStatus status = fs.getFileStatus(directory); - if (status != null) { - isDir = status.isDir(); - } - } catch (IOException e) { - log.error(e.getMessage(), e); - } - if (!isDir) { - throw new IOException(directory + " is not a directory"); - } - } - - - private void create() throws IOException { - if (!fs.exists(directory)) { - fs.mkdirs(directory); - } - - boolean isDir = false; - try { - FileStatus status = fs.getFileStatus(directory); - if (status != null) { - isDir = status.isDir(); - } - } catch (IOException e) { - log.error(e.getMessage(), e); - } - if (!isDir) { - throw new IOException(directory + " is not a directory"); - } - - // clear old index files - FileStatus[] fileStatus = - fs.listStatus(directory, LuceneIndexFileNameFilter.getFilter()); - for (FileStatus status : fileStatus) { - if (!fs.delete(status.getPath(), true)) { - throw new IOException("Cannot delete index file " - + status.getPath()); - } - } - } - - public String[] list() throws IOException { - FileStatus[] fileStatus = - fs.listStatus(directory, LuceneIndexFileNameFilter.getFilter()); - String[] result = new String[fileStatus.length]; - for (int i = 0; i < fileStatus.length; i++) { - result[i] = fileStatus[i].getPath().getName(); - } - return result; - } - - @Override - public String[] listAll() throws IOException { - return list(); - } - - @Override - public boolean fileExists(String name) throws IOException { - return fs.exists(new Path(directory, name)); - } - - @Override - public long fileLength(String name) throws IOException { - return fs.getFileStatus(new Path(directory, name)).getLen(); - } - - @Override - public void deleteFile(String name) throws IOException { - if (!fs.delete(new Path(directory, name), true)) { - throw new IOException("Cannot delete index file " + name); - } - } - - @Override - public IndexOutput createOutput(String name, IOContext context) throws IOException { - //TODO: What should we be doing with the IOContext here, if anything? - Path file = new Path(directory, name); - if (fs.exists(file) && !fs.delete(file, true)) { - // delete the existing one if applicable - throw new IOException("Cannot overwrite index file " + file); - } - - return new FileSystemIndexOutput(file, ioFileBufferSize); - } - - @Override - public void sync(Collection<String> names) throws IOException { - // do nothing, as this is read-only - } - - @Override - public IndexInput openInput(String name, IOContext context) throws IOException { - return new FileSystemIndexInput(new Path(directory, name), ioFileBufferSize); - } - - @Override - public Lock makeLock(final String name) { - return new Lock() { - public boolean obtain() { - return true; - } - - public void release() { - } - - public boolean isLocked() { - throw new UnsupportedOperationException(); - } - - public String toString() { - return "Lock@" + new Path(directory, name); - } - }; - } - - @Override - public void clearLock(String name) throws IOException { - // do nothing - } - - @Override - public void close() throws IOException { - // do not close the file system - } - - @Override - public void setLockFactory(LockFactory lockFactory) throws IOException { - // do nothing - } - - @Override - public LockFactory getLockFactory() { - return null; - } - - @Override - public String toString() { - return this.getClass().getName() + "@" + directory; - } - - private class FileSystemIndexInput extends BufferedIndexInput implements Cloneable { - - // shared by clones - private class Descriptor { - public final FSDataInputStream in; - public long position; // cache of in.getPos() - - public Descriptor(Path file, int ioFileBufferSize) throws IOException { - this.in = fs.open(file, ioFileBufferSize); - } - } - - private final Path filePath; // for debugging - private final Descriptor descriptor; - private final long length; - private boolean isOpen; - private boolean isClone; - - public FileSystemIndexInput(Path path, int ioFileBufferSize) - throws IOException { - super("FSII_" + path.getName(), ioFileBufferSize); - filePath = path; - descriptor = new Descriptor(path, ioFileBufferSize); - length = fs.getFileStatus(path).getLen(); - isOpen = true; - } - - @Override - protected void readInternal(byte[] b, int offset, int len) - throws IOException { - long position = getFilePointer(); - if (position != descriptor.position) { - descriptor.in.seek(position); - descriptor.position = position; - } - int total = 0; - do { - int i = descriptor.in.read(b, offset + total, len - total); - if (i == -1) { - throw new IOException("Read past EOF"); - } - descriptor.position += i; - total += i; - } while (total < len); - } - - @Override - public void close() throws IOException { - if (!isClone) { - if (isOpen) { - descriptor.in.close(); - isOpen = false; - } else { - throw new IOException("Index file " + filePath + " already closed"); - } - } - } - - @Override - protected void seekInternal(long position) { - // handled in readInternal() - } - - @Override - public long length() { - return length; - } - - @Override - protected void finalize() throws Throwable { - super.finalize(); - if (!isClone && isOpen) { - close(); // close the file - } - } - - @Override - public BufferedIndexInput clone() { - FileSystemIndexInput clone = (FileSystemIndexInput) super.clone(); - clone.isClone = true; - return clone; - } - } - - private class FileSystemIndexOutput extends BufferedIndexOutput { - - private final Path filePath; // for debugging - private final FSDataOutputStream out; - private boolean isOpen; - - public FileSystemIndexOutput(Path path, int ioFileBufferSize) - throws IOException { - filePath = path; - // overwrite is true by default - out = fs.create(path, true, ioFileBufferSize); - isOpen = true; - } - - @Override - public void flushBuffer(byte[] b, int offset, int size) throws IOException { - out.write(b, offset, size); - } - - @Override - public void close() throws IOException { - if (isOpen) { - super.close(); - out.close(); - isOpen = false; - } else { - throw new IOException("Index file " + filePath + " already closed"); - } - } - - @Override - public void seek(long pos) throws IOException { - throw new UnsupportedOperationException(); - } - - @Override - public long length() throws IOException { - return out.getPos(); - } - - @Override - protected void finalize() throws Throwable { - super.finalize(); - if (isOpen) { - close(); // close the file - } - } - } - -} http://git-wip-us.apache.org/repos/asf/mahout/blob/82e78a8c/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromLuceneStorage.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromLuceneStorage.java b/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromLuceneStorage.java deleted file mode 100644 index 84953c2..0000000 --- a/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromLuceneStorage.java +++ /dev/null @@ -1,139 +0,0 @@ -package org.apache.mahout.text; -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.File; -import java.io.IOException; -import java.util.List; - -import com.google.common.base.Strings; -import com.google.common.io.Closeables; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.SequenceFile; -import org.apache.hadoop.io.Text; -import org.apache.lucene.document.Document; -import org.apache.lucene.document.DocumentStoredFieldVisitor; -import org.apache.lucene.index.AtomicReaderContext; -import org.apache.lucene.index.DirectoryReader; -import org.apache.lucene.index.IndexReader; -import org.apache.lucene.search.Collector; -import org.apache.lucene.search.IndexSearcher; -import org.apache.lucene.search.Scorer; -import org.apache.lucene.store.Directory; -import org.apache.lucene.store.FSDirectory; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import static org.apache.commons.lang.StringUtils.isBlank; - -/** - * Generates a sequence file from a Lucene index with a specified id field as the key and a content field as the value. - * Configure this class with a {@link LuceneStorageConfiguration} bean. - */ -@Deprecated -public class SequenceFilesFromLuceneStorage { - private static final Logger log = LoggerFactory.getLogger(SequenceFilesFromLuceneStorage.class); - - /** - * Generates a sequence files from a Lucene index via the given {@link LuceneStorageConfiguration} - * - * @param lucene2seqConf configuration bean - * @throws java.io.IOException if index cannot be opened or sequence file could not be written - */ - public void run(final LuceneStorageConfiguration lucene2seqConf) throws IOException { - List<Path> indexPaths = lucene2seqConf.getIndexPaths(); - int processedDocs = 0; - - for (Path indexPath : indexPaths) { - Directory directory = FSDirectory.open(new File(indexPath.toUri().getPath())); - IndexReader reader = DirectoryReader.open(directory); - IndexSearcher searcher = new IndexSearcher(reader); - - LuceneIndexHelper.fieldShouldExistInIndex(reader, lucene2seqConf.getIdField()); - for (String field : lucene2seqConf.getFields()) { - LuceneIndexHelper.fieldShouldExistInIndex(reader, field); - } - - Configuration configuration = lucene2seqConf.getConfiguration(); - FileSystem fileSystem = FileSystem.get(configuration); - Path sequenceFilePath = new Path(lucene2seqConf.getSequenceFilesOutputPath(), indexPath.getName()); - final SequenceFile.Writer sequenceFileWriter = new SequenceFile.Writer(fileSystem, configuration, - sequenceFilePath, Text.class, Text.class); - - SeqFileWriterCollector writerCollector = new SeqFileWriterCollector(lucene2seqConf, sequenceFileWriter, - processedDocs); - searcher.search(lucene2seqConf.getQuery(), writerCollector); - log.info("Wrote " + writerCollector.processedDocs + " documents in " + sequenceFilePath.toUri()); - processedDocs = writerCollector.processedDocs; - Closeables.close(sequenceFileWriter, false); - directory.close(); - //searcher.close(); - reader.close(); - } - } - - private static class SeqFileWriterCollector extends Collector { - private final LuceneStorageConfiguration lucene2seqConf; - private final SequenceFile.Writer sequenceFileWriter; - public int processedDocs; - AtomicReaderContext arc; - - SeqFileWriterCollector(LuceneStorageConfiguration lucene2seqConf, SequenceFile.Writer sequenceFileWriter, - int processedDocs) { - this.lucene2seqConf = lucene2seqConf; - this.sequenceFileWriter = sequenceFileWriter; - this.processedDocs = processedDocs; - } - - @Override - public void setScorer(Scorer scorer) throws IOException { - //don't care about scoring, we just want the matches - } - - @Override - public void collect(int docNum) throws IOException { - if (processedDocs < lucene2seqConf.getMaxHits()) { - final DocumentStoredFieldVisitor storedFieldVisitor = lucene2seqConf.getStoredFieldVisitor(); - arc.reader().document(docNum, storedFieldVisitor); - - Document doc = storedFieldVisitor.getDocument(); - List<String> fields = lucene2seqConf.getFields(); - Text theKey = new Text(Strings.nullToEmpty(doc.get(lucene2seqConf.getIdField()))); - Text theValue = new Text(); - LuceneSeqFileHelper.populateValues(doc, theValue, fields); - //if they are both empty, don't write - if (isBlank(theKey.toString()) && isBlank(theValue.toString())) { - return; - } - sequenceFileWriter.append(theKey, theValue); - processedDocs++; - } - } - - @Override - public void setNextReader(AtomicReaderContext context) throws IOException { - arc = context; - } - - @Override - public boolean acceptsDocsOutOfOrder() { - return true; - } - } -} http://git-wip-us.apache.org/repos/asf/mahout/blob/82e78a8c/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageDriver.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageDriver.java b/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageDriver.java deleted file mode 100644 index 9685b85..0000000 --- a/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageDriver.java +++ /dev/null @@ -1,140 +0,0 @@ -package org.apache.mahout.text; -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; -import java.util.regex.Pattern; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.util.ToolRunner; -import org.apache.lucene.analysis.standard.StandardAnalyzer; -import org.apache.lucene.queryparser.classic.ParseException; -import org.apache.lucene.queryparser.classic.QueryParser; -import org.apache.lucene.search.MatchAllDocsQuery; -import org.apache.lucene.search.Query; -import org.apache.lucene.util.Version; -import org.apache.mahout.common.AbstractJob; -import org.apache.mahout.common.commandline.DefaultOptionCreator; - -/** - * Driver class for the lucene2seq program. Converts text contents of stored fields of a lucene index into a Hadoop - * SequenceFile. The key of the sequence file is the document ID and the value is the concatenated text of the specified - * stored field(s). - * - * Deprecated as of Mahout 0.11.0 - */ - -@Deprecated -public class SequenceFilesFromLuceneStorageDriver extends AbstractJob { - - static final String OPTION_ID_FIELD = "idField"; - static final String OPTION_FIELD = "fields"; - static final String OPTION_QUERY = "query"; - static final String OPTION_MAX_HITS = "maxHits"; - - static final Query DEFAULT_QUERY = new MatchAllDocsQuery(); - static final int DEFAULT_MAX_HITS = Integer.MAX_VALUE; - - static final String SEPARATOR_FIELDS = ","; - static final String QUERY_DELIMITER = "'"; - private static final Pattern COMPILE = Pattern.compile(QUERY_DELIMITER); - - public static void main(String[] args) throws Exception { - ToolRunner.run(new SequenceFilesFromLuceneStorageDriver(), args); - } - - @Override - public int run(String[] args) throws Exception { - addOutputOption(); - addInputOption(); - //addOption(OPTION_LUCENE_DIRECTORY, "d", "Lucene directory / directories. Comma separated.", true); - addOption(OPTION_ID_FIELD, "id", "The field in the index containing the id", true); - addOption(OPTION_FIELD, "f", "The stored field(s) in the index containing text", true); - - addOption(OPTION_QUERY, "q", "(Optional) Lucene query. Defaults to " + DEFAULT_QUERY.getClass().getSimpleName()); - addOption(OPTION_MAX_HITS, "n", "(Optional) Max hits. Defaults to " + DEFAULT_MAX_HITS); - addOption(DefaultOptionCreator.methodOption().create()); - - if (parseArguments(args) == null) { - return -1; - } - - Configuration configuration = getConf(); - - String[] paths = getInputPath().toString().split(","); - List<Path> indexPaths = new ArrayList<>(); - for (String path : paths) { - indexPaths.add(new Path(path)); - } - - Path sequenceFilesOutputPath = getOutputPath(); - - String idField = getOption(OPTION_ID_FIELD); - String fields = getOption(OPTION_FIELD); - - LuceneStorageConfiguration lucene2SeqConf = newLucene2SeqConfiguration(configuration, - indexPaths, - sequenceFilesOutputPath, - idField, - Arrays.asList(fields.split(SEPARATOR_FIELDS))); - - Query query = DEFAULT_QUERY; - if (hasOption(OPTION_QUERY)) { - try { - String queryString = COMPILE.matcher(getOption(OPTION_QUERY)).replaceAll(""); - QueryParser queryParser = new QueryParser(Version.LUCENE_46, queryString, - new StandardAnalyzer(Version.LUCENE_46)); - query = queryParser.parse(queryString); - } catch (ParseException e) { - throw new IllegalArgumentException(e.getMessage(), e); - } - } - lucene2SeqConf.setQuery(query); - - int maxHits = DEFAULT_MAX_HITS; - if (hasOption(OPTION_MAX_HITS)) { - String maxHitsString = getOption(OPTION_MAX_HITS); - maxHits = Integer.valueOf(maxHitsString); - } - lucene2SeqConf.setMaxHits(maxHits); - - if (hasOption(DefaultOptionCreator.METHOD_OPTION) - && getOption(DefaultOptionCreator.METHOD_OPTION).equals("sequential")) { - new SequenceFilesFromLuceneStorage().run(lucene2SeqConf); - } else { - new SequenceFilesFromLuceneStorageMRJob().run(lucene2SeqConf); - } - return 0; - } - - public LuceneStorageConfiguration newLucene2SeqConfiguration(Configuration configuration, - List<Path> indexPaths, - Path sequenceFilesOutputPath, - String idField, - List<String> fields) { - return new LuceneStorageConfiguration( - configuration, - indexPaths, - sequenceFilesOutputPath, - idField, - fields); - } -} http://git-wip-us.apache.org/repos/asf/mahout/blob/82e78a8c/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageMRJob.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageMRJob.java b/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageMRJob.java deleted file mode 100644 index 787bf15..0000000 --- a/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageMRJob.java +++ /dev/null @@ -1,66 +0,0 @@ -package org.apache.mahout.text; -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import com.google.common.base.Joiner; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.io.Text; -import org.apache.hadoop.mapreduce.Job; -import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; -import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; -import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; - -import java.io.IOException; - -/** - * Generates a sequence file from a Lucene index via MapReduce. Uses a specified id field as the key and a content field - * as the value. Configure this class with a {@link LuceneStorageConfiguration} bean. - */ -@Deprecated -public class SequenceFilesFromLuceneStorageMRJob { - - public void run(LuceneStorageConfiguration lucene2seqConf) { - try { - Configuration configuration = lucene2seqConf.serialize(); - - Job job = new Job(configuration, "LuceneIndexToSequenceFiles: " + lucene2seqConf.getIndexPaths() + " -> M/R -> " - + lucene2seqConf.getSequenceFilesOutputPath()); - - job.setMapOutputKeyClass(Text.class); - job.setMapOutputValueClass(Text.class); - - job.setOutputKeyClass(Text.class); - job.setOutputValueClass(Text.class); - - job.setOutputFormatClass(SequenceFileOutputFormat.class); - - job.setMapperClass(SequenceFilesFromLuceneStorageMapper.class); - - job.setInputFormatClass(LuceneSegmentInputFormat.class); - - FileInputFormat.setInputPaths(job, Joiner.on(',').skipNulls().join(lucene2seqConf.getIndexPaths().iterator())); - FileOutputFormat.setOutputPath(job, lucene2seqConf.getSequenceFilesOutputPath()); - - job.setJarByClass(SequenceFilesFromLuceneStorageMRJob.class); - job.setNumReduceTasks(0); - - job.waitForCompletion(true); - } catch (IOException | InterruptedException | ClassNotFoundException e) { - throw new RuntimeException(e); - } - } -} http://git-wip-us.apache.org/repos/asf/mahout/blob/82e78a8c/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageMapper.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageMapper.java b/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageMapper.java deleted file mode 100644 index 5feceef..0000000 --- a/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageMapper.java +++ /dev/null @@ -1,83 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.text; - -import com.google.common.base.Strings; -import org.apache.commons.lang3.StringUtils; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.io.NullWritable; -import org.apache.hadoop.io.Text; -import org.apache.hadoop.mapreduce.Mapper; -import org.apache.lucene.document.Document; -import org.apache.lucene.document.DocumentStoredFieldVisitor; -import org.apache.lucene.index.SegmentCommitInfo; -import org.apache.lucene.index.SegmentReader; -import org.apache.lucene.store.IOContext; - -import java.io.IOException; -import java.util.List; - -/** - * Maps document IDs to key value pairs with ID field as the key and the concatenated stored field(s) - * as value. - */ -@Deprecated -public class SequenceFilesFromLuceneStorageMapper extends Mapper<Text, NullWritable, Text, Text> { - - public enum DataStatus { EMPTY_KEY, EMPTY_VALUE, EMPTY_BOTH } - - private LuceneStorageConfiguration l2sConf; - private SegmentReader segmentReader; - - @Override - protected void setup(Context context) throws IOException, InterruptedException { - Configuration configuration = context.getConfiguration(); - l2sConf = new LuceneStorageConfiguration(configuration); - LuceneSegmentInputSplit inputSplit = (LuceneSegmentInputSplit) context.getInputSplit(); - SegmentCommitInfo segmentInfo = inputSplit.getSegment(configuration); - segmentReader = new SegmentReader(segmentInfo, LuceneSeqFileHelper.USE_TERM_INFOS, IOContext.READ); - } - - @Override - protected void map(Text key, NullWritable text, Context context) throws IOException, InterruptedException { - int docId = Integer.valueOf(key.toString()); - DocumentStoredFieldVisitor storedFieldVisitor = l2sConf.getStoredFieldVisitor(); - segmentReader.document(docId, storedFieldVisitor); - Document document = storedFieldVisitor.getDocument(); - List<String> fields = l2sConf.getFields(); - Text theKey = new Text(Strings.nullToEmpty(document.get(l2sConf.getIdField()))); - Text theValue = new Text(); - LuceneSeqFileHelper.populateValues(document, theValue, fields); - //if they are both empty, don't write - if (StringUtils.isBlank(theKey.toString()) && StringUtils.isBlank(theValue.toString())) { - context.getCounter(DataStatus.EMPTY_BOTH).increment(1); - return; - } - if (StringUtils.isBlank(theKey.toString())) { - context.getCounter(DataStatus.EMPTY_KEY).increment(1); - } else if (StringUtils.isBlank(theValue.toString())) { - context.getCounter(DataStatus.EMPTY_VALUE).increment(1); - } - context.write(theKey, theValue); - } - - @Override - protected void cleanup(Context context) throws IOException, InterruptedException { - segmentReader.close(); - } -} http://git-wip-us.apache.org/repos/asf/mahout/blob/82e78a8c/integration/src/test/java/org/apache/mahout/text/AbstractLuceneStorageTest.java ---------------------------------------------------------------------- diff --git a/integration/src/test/java/org/apache/mahout/text/AbstractLuceneStorageTest.java b/integration/src/test/java/org/apache/mahout/text/AbstractLuceneStorageTest.java deleted file mode 100644 index 3164092..0000000 --- a/integration/src/test/java/org/apache/mahout/text/AbstractLuceneStorageTest.java +++ /dev/null @@ -1,107 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.mahout.text; - -import java.io.File; -import java.io.IOException; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; - -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.Text; -import org.apache.lucene.analysis.standard.StandardAnalyzer; -import org.apache.lucene.index.IndexWriter; -import org.apache.lucene.index.IndexWriterConfig; -import org.apache.lucene.store.Directory; -import org.apache.lucene.store.FSDirectory; -import org.apache.lucene.util.Version; -import org.apache.mahout.common.MahoutTestCase; -import org.apache.mahout.common.Pair; -import org.apache.mahout.text.doc.MultipleFieldsDocument; -import org.apache.mahout.text.doc.NumericFieldDocument; -import org.apache.mahout.text.doc.SingleFieldDocument; -import org.apache.mahout.text.doc.TestDocument; - -/** - * Abstract test for working with Lucene storage. - */ -@Deprecated -public abstract class AbstractLuceneStorageTest extends MahoutTestCase { - - protected Path indexPath1; - protected Path indexPath2; - protected List<TestDocument> docs = new ArrayList<>(); - protected List<TestDocument> misshapenDocs = new ArrayList<>(); - - @Override - public void setUp() throws Exception { - super.setUp(); - indexPath1 = getTestTempDirPath("index1"); - indexPath2 = getTestTempDirPath("index2"); - for (int i = 0; i < 2000; i++) { - docs.add(new SingleFieldDocument(String.valueOf(i), "This is test document " + i)); - } - misshapenDocs.add(new SingleFieldDocument("", "This doc has an empty id")); - misshapenDocs.add(new SingleFieldDocument("empty_value", "")); - } - - protected void commitDocuments(Directory directory, Iterable<TestDocument> theDocs) throws IOException{ - IndexWriter indexWriter = new IndexWriter(directory, new IndexWriterConfig(Version.LUCENE_46, new StandardAnalyzer(Version.LUCENE_46))); - - for (TestDocument singleFieldDocument : theDocs) { - indexWriter.addDocument(singleFieldDocument.asLuceneDocument()); - } - - indexWriter.commit(); - indexWriter.close(); - } - - protected void commitDocuments(Directory directory, TestDocument... documents) throws IOException { - commitDocuments(directory, Arrays.asList(documents)); - } - - protected void assertMultipleFieldsDocumentEquals(MultipleFieldsDocument expected, Pair<Text, Text> actual) { - assertEquals(expected.getId(), actual.getFirst().toString()); - assertEquals(expected.getField() + " " + expected.getField1() + " " + expected.getField2(), actual.getSecond().toString()); - } - - protected void assertNumericFieldEquals(NumericFieldDocument expected, Pair<Text, Text> actual) { - assertEquals(expected.getId(), actual.getFirst().toString()); - assertEquals(expected.getField() + " " + expected.getNumericField(), actual.getSecond().toString()); - } - - protected FSDirectory getDirectory(File indexPath) throws IOException { - return FSDirectory.open(indexPath); - } - - protected File getIndexPath1AsFile() { - return new File(indexPath1.toUri().getPath()); - } - - protected Path getIndexPath1() { - return indexPath1; - } - - protected File getIndexPath2AsFile() { - return new File(indexPath2.toUri().getPath()); - } - - protected Path getIndexPath2() { - return indexPath2; - } -} http://git-wip-us.apache.org/repos/asf/mahout/blob/82e78a8c/integration/src/test/java/org/apache/mahout/text/LuceneSegmentInputFormatTest.java ---------------------------------------------------------------------- diff --git a/integration/src/test/java/org/apache/mahout/text/LuceneSegmentInputFormatTest.java b/integration/src/test/java/org/apache/mahout/text/LuceneSegmentInputFormatTest.java deleted file mode 100644 index ee81a32..0000000 --- a/integration/src/test/java/org/apache/mahout/text/LuceneSegmentInputFormatTest.java +++ /dev/null @@ -1,85 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.mahout.text; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.mapreduce.JobContext; -import org.apache.hadoop.mapreduce.JobID; -import org.apache.mahout.common.HadoopUtil; -import org.apache.mahout.text.doc.SingleFieldDocument; -import org.junit.After; -import org.junit.Assert; -import org.junit.Before; -import org.junit.Test; - -import java.io.IOException; -import java.lang.reflect.InvocationTargetException; -import java.util.Collections; -import java.util.List; -@Deprecated -public class LuceneSegmentInputFormatTest extends AbstractLuceneStorageTest { - - private LuceneSegmentInputFormat inputFormat; - private JobContext jobContext; - private Configuration conf; - - @Before - public void before() throws Exception { - inputFormat = new LuceneSegmentInputFormat(); - LuceneStorageConfiguration lucene2SeqConf = new - LuceneStorageConfiguration(getConfiguration(), Collections.singletonList(indexPath1), new Path("output"), "id", Collections.singletonList("field")); - conf = lucene2SeqConf.serialize(); - - jobContext = getJobContext(conf, new JobID()); - } - - @After - public void after() throws IOException { - HadoopUtil.delete(conf, indexPath1); - } - - @Test - public void testGetSplits() throws IOException, InterruptedException { - SingleFieldDocument doc1 = new SingleFieldDocument("1", "This is simple document 1"); - SingleFieldDocument doc2 = new SingleFieldDocument("2", "This is simple document 2"); - SingleFieldDocument doc3 = new SingleFieldDocument("3", "This is simple document 3"); - - //generate 3 segments - commitDocuments(getDirectory(getIndexPath1AsFile()), doc1); - commitDocuments(getDirectory(getIndexPath1AsFile()), doc2); - commitDocuments(getDirectory(getIndexPath1AsFile()), doc3); - - List<LuceneSegmentInputSplit> splits = inputFormat.getSplits(jobContext); - Assert.assertEquals(3, splits.size()); - } - - // Use reflection to abstract this incompatibility between Hadoop 1 & 2 APIs. - private JobContext getJobContext(Configuration conf, JobID jobID) throws - ClassNotFoundException, NoSuchMethodException, IllegalAccessException, - InvocationTargetException, InstantiationException { - Class<? extends JobContext> clazz; - if (!JobContext.class.isInterface()) { - clazz = JobContext.class; - } else { - clazz = (Class<? extends JobContext>) - Class.forName("org.apache.hadoop.mapreduce.task.JobContextImpl"); - } - return clazz.getConstructor(Configuration.class, JobID.class) - .newInstance(conf, jobID); - } -} http://git-wip-us.apache.org/repos/asf/mahout/blob/82e78a8c/integration/src/test/java/org/apache/mahout/text/LuceneSegmentInputSplitTest.java ---------------------------------------------------------------------- diff --git a/integration/src/test/java/org/apache/mahout/text/LuceneSegmentInputSplitTest.java b/integration/src/test/java/org/apache/mahout/text/LuceneSegmentInputSplitTest.java deleted file mode 100644 index 5375610..0000000 --- a/integration/src/test/java/org/apache/mahout/text/LuceneSegmentInputSplitTest.java +++ /dev/null @@ -1,88 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.mahout.text; - -import org.apache.hadoop.conf.Configuration; -import org.apache.lucene.index.SegmentCommitInfo; -import org.apache.lucene.index.SegmentReader; -import org.apache.lucene.store.IOContext; -import org.apache.mahout.common.HadoopUtil; -import org.apache.mahout.text.doc.SingleFieldDocument; -import org.junit.After; -import org.junit.Before; -import org.junit.Test; - -import java.io.IOException; -import java.util.List; - -import static java.util.Arrays.asList; -@Deprecated -public class LuceneSegmentInputSplitTest extends AbstractLuceneStorageTest { - - private Configuration configuration; - - @Before - public void before() throws IOException { - configuration = getConfiguration(); - } - - @After - public void after() throws IOException { - HadoopUtil.delete(configuration, indexPath1); - } - - @Test - public void testGetSegment() throws Exception { - SingleFieldDocument doc1 = new SingleFieldDocument("1", "This is simple document 1"); - SingleFieldDocument doc2 = new SingleFieldDocument("2", "This is simple document 2"); - SingleFieldDocument doc3 = new SingleFieldDocument("3", "This is simple document 3"); - - List<SingleFieldDocument> docs = asList(doc1, doc2, doc3); - for (SingleFieldDocument doc : docs) { - commitDocuments(getDirectory(getIndexPath1AsFile()), doc); - } - - assertSegmentContainsOneDoc("_0"); - assertSegmentContainsOneDoc("_1"); - assertSegmentContainsOneDoc("_2"); - } - - @Test(expected = IllegalArgumentException.class) - public void testGetSegmentNonExistingSegment() throws Exception { - SingleFieldDocument doc1 = new SingleFieldDocument("1", "This is simple document 1"); - SingleFieldDocument doc2 = new SingleFieldDocument("2", "This is simple document 2"); - SingleFieldDocument doc3 = new SingleFieldDocument("3", "This is simple document 3"); - - List<SingleFieldDocument> docs = asList(doc1, doc2, doc3); - for (SingleFieldDocument doc : docs) { - commitDocuments(getDirectory(getIndexPath1AsFile()), doc); - } - - LuceneSegmentInputSplit inputSplit = new LuceneSegmentInputSplit(indexPath1, "_3", 1000); - inputSplit.getSegment(configuration); - } - - private void assertSegmentContainsOneDoc(String segmentName) throws IOException { - LuceneSegmentInputSplit inputSplit = new LuceneSegmentInputSplit(indexPath1, segmentName, 1000); - SegmentCommitInfo segment = inputSplit.getSegment(configuration); - SegmentReader segmentReader = new SegmentReader(segment, 1, IOContext.READ);//SegmentReader.get(true, segment, 1); - assertEquals(segmentName, segment.info.name); - assertEquals(1, segmentReader.numDocs()); - } - - -} http://git-wip-us.apache.org/repos/asf/mahout/blob/82e78a8c/integration/src/test/java/org/apache/mahout/text/LuceneSegmentRecordReaderTest.java ---------------------------------------------------------------------- diff --git a/integration/src/test/java/org/apache/mahout/text/LuceneSegmentRecordReaderTest.java b/integration/src/test/java/org/apache/mahout/text/LuceneSegmentRecordReaderTest.java deleted file mode 100644 index 8a23ecb..0000000 --- a/integration/src/test/java/org/apache/mahout/text/LuceneSegmentRecordReaderTest.java +++ /dev/null @@ -1,121 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.mahout.text; - -import java.io.IOException; -import java.lang.reflect.InvocationTargetException; -import java.util.Collections; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.NullWritable; -import org.apache.hadoop.mapreduce.TaskAttemptContext; -import org.apache.hadoop.mapreduce.TaskAttemptID; -import org.apache.lucene.index.SegmentCommitInfo; -import org.apache.lucene.index.SegmentInfos; -import org.apache.mahout.common.HadoopUtil; -import org.junit.After; -import org.junit.Before; -import org.junit.Test; - -import static org.apache.mahout.text.doc.SingleFieldDocument.FIELD; -import static org.apache.mahout.text.doc.SingleFieldDocument.ID_FIELD; -@Deprecated -public class LuceneSegmentRecordReaderTest extends AbstractLuceneStorageTest { - private Configuration configuration; - - private LuceneSegmentRecordReader recordReader; - - private SegmentInfos segmentInfos; - - @Before - public void before() throws IOException, InterruptedException { - LuceneStorageConfiguration lucene2SeqConf = new LuceneStorageConfiguration(getConfiguration(), - Collections.singletonList(getIndexPath1()), new Path("output"), ID_FIELD, - Collections.singletonList(FIELD)); - configuration = lucene2SeqConf.serialize(); - recordReader = new LuceneSegmentRecordReader(); - commitDocuments(getDirectory(getIndexPath1AsFile()), docs.subList(0, 500)); - commitDocuments(getDirectory(getIndexPath1AsFile()), docs.subList(500, 1000)); - segmentInfos = new SegmentInfos(); - segmentInfos.read(getDirectory(getIndexPath1AsFile())); - } - - @After - public void after() throws IOException { - HadoopUtil.delete(configuration, getIndexPath1()); - } - - @Test - public void testKey() throws Exception { - for (SegmentCommitInfo segmentInfo : segmentInfos) { - int docId = 0; - LuceneSegmentInputSplit inputSplit = new LuceneSegmentInputSplit(getIndexPath1(), - segmentInfo.info.name, segmentInfo.sizeInBytes()); - TaskAttemptContext context = getTaskAttemptContext(configuration, new TaskAttemptID()); - recordReader.initialize(inputSplit, context); - for (int i = 0; i < 500; i++){ - recordReader.nextKeyValue(); - //we can't be sure of the order we are getting the segments, so we have to fudge here a bit on the id, - // but it is either id: i or i + 500 - assertTrue("i = " + i + " docId= " + - docId, String.valueOf(docId).equals(recordReader.getCurrentKey().toString()) || - String.valueOf(docId+500).equals(recordReader.getCurrentKey().toString())); - assertEquals(NullWritable.get(), recordReader.getCurrentValue()); - docId++; - } - } - } - - @Test(expected = IllegalArgumentException.class) - public void testNonExistingIdField() throws Exception { - configuration = new LuceneStorageConfiguration(getConfiguration(), - Collections.singletonList(getIndexPath1()), new Path("output"), "nonExistingId", - Collections.singletonList(FIELD)).serialize(); - SegmentCommitInfo segmentInfo = segmentInfos.iterator().next(); - LuceneSegmentInputSplit inputSplit = new LuceneSegmentInputSplit(getIndexPath1(), - segmentInfo.info.name, segmentInfo.sizeInBytes()); - TaskAttemptContext context = getTaskAttemptContext(configuration, new TaskAttemptID()); - recordReader.initialize(inputSplit, context); - } - - @Test(expected = IllegalArgumentException.class) - public void testNonExistingField() throws Exception { - configuration = new LuceneStorageConfiguration(getConfiguration(), Collections.singletonList(getIndexPath1()), - new Path("output"), ID_FIELD, Collections.singletonList("nonExistingField")).serialize(); - SegmentCommitInfo segmentInfo = segmentInfos.iterator().next(); - LuceneSegmentInputSplit inputSplit = new LuceneSegmentInputSplit(getIndexPath1(), - segmentInfo.info.name, segmentInfo.sizeInBytes()); - TaskAttemptContext context = getTaskAttemptContext(configuration, new TaskAttemptID()); - recordReader.initialize(inputSplit, context); - } - - // Use reflection to abstract this incompatibility between Hadoop 1 & 2 APIs. - private TaskAttemptContext getTaskAttemptContext(Configuration conf, TaskAttemptID jobID) throws - ClassNotFoundException, NoSuchMethodException, IllegalAccessException, - InvocationTargetException, InstantiationException { - Class<? extends TaskAttemptContext> clazz; - if (!TaskAttemptContext.class.isInterface()) { - clazz = TaskAttemptContext.class; - } else { - clazz = (Class<? extends TaskAttemptContext>) - Class.forName("org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl"); - } - return clazz.getConstructor(Configuration.class, TaskAttemptID.class) - .newInstance(conf, jobID); - } -} http://git-wip-us.apache.org/repos/asf/mahout/blob/82e78a8c/integration/src/test/java/org/apache/mahout/text/LuceneStorageConfigurationTest.java ---------------------------------------------------------------------- diff --git a/integration/src/test/java/org/apache/mahout/text/LuceneStorageConfigurationTest.java b/integration/src/test/java/org/apache/mahout/text/LuceneStorageConfigurationTest.java deleted file mode 100644 index e24066c..0000000 --- a/integration/src/test/java/org/apache/mahout/text/LuceneStorageConfigurationTest.java +++ /dev/null @@ -1,49 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.mahout.text; - -import java.io.IOException; -import java.util.Collections; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; -import org.apache.mahout.common.MahoutTestCase; -import org.junit.Test; -@Deprecated -public class LuceneStorageConfigurationTest extends MahoutTestCase { - - @Test - public void testSerialization() throws Exception { - Configuration configuration = getConfiguration(); - Path indexPath = new Path("indexPath"); - Path outputPath = new Path("outputPath"); - LuceneStorageConfiguration luceneStorageConfiguration = - new LuceneStorageConfiguration(configuration, Collections.singletonList(indexPath), outputPath, - "id", Collections.singletonList("field")); - - Configuration serializedConfiguration = luceneStorageConfiguration.serialize(); - - LuceneStorageConfiguration deSerializedConfiguration = new LuceneStorageConfiguration(serializedConfiguration); - - assertEquals(luceneStorageConfiguration, deSerializedConfiguration); - } - - @Test(expected = IllegalArgumentException.class) - public void testSerializationNotSerialized() throws IOException { - new LuceneStorageConfiguration(getConfiguration()); - } -}
