http://git-wip-us.apache.org/repos/asf/incubator-rya/blob/5a03ef61/extras/indexing/src/main/java/mvm/rya/indexing/accumulo/freetext/AccumuloFreeTextIndexer.java ---------------------------------------------------------------------- diff --git a/extras/indexing/src/main/java/mvm/rya/indexing/accumulo/freetext/AccumuloFreeTextIndexer.java b/extras/indexing/src/main/java/mvm/rya/indexing/accumulo/freetext/AccumuloFreeTextIndexer.java deleted file mode 100644 index f529569..0000000 --- a/extras/indexing/src/main/java/mvm/rya/indexing/accumulo/freetext/AccumuloFreeTextIndexer.java +++ /dev/null @@ -1,611 +0,0 @@ -package mvm.rya.indexing.accumulo.freetext; - -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - - - -import static mvm.rya.indexing.accumulo.freetext.query.ASTNodeUtils.getNodeIterator; -import info.aduna.iteration.CloseableIteration; - -import java.io.IOException; -import java.nio.charset.CharacterCodingException; -import java.util.ArrayList; -import java.util.HashSet; -import java.util.Iterator; -import java.util.List; -import java.util.Map.Entry; -import java.util.Set; -import java.util.SortedSet; -import java.util.TreeSet; - -import mvm.rya.accumulo.experimental.AbstractAccumuloIndexer; -import mvm.rya.api.domain.RyaStatement; -import mvm.rya.api.resolver.RyaToRdfConversions; -import mvm.rya.indexing.FreeTextIndexer; -import mvm.rya.indexing.StatementContraints; -import mvm.rya.indexing.accumulo.ConfigUtils; -import mvm.rya.indexing.accumulo.Md5Hash; -import mvm.rya.indexing.accumulo.StatementSerializer; -import mvm.rya.indexing.accumulo.freetext.iterators.BooleanTreeIterator; -import mvm.rya.indexing.accumulo.freetext.query.ASTExpression; -import mvm.rya.indexing.accumulo.freetext.query.ASTNodeUtils; -import mvm.rya.indexing.accumulo.freetext.query.ASTSimpleNode; -import mvm.rya.indexing.accumulo.freetext.query.ASTTerm; -import mvm.rya.indexing.accumulo.freetext.query.ParseException; -import mvm.rya.indexing.accumulo.freetext.query.QueryParser; -import mvm.rya.indexing.accumulo.freetext.query.QueryParserTreeConstants; -import mvm.rya.indexing.accumulo.freetext.query.SimpleNode; -import mvm.rya.indexing.accumulo.freetext.query.TokenMgrError; - -import org.apache.accumulo.core.client.AccumuloException; -import org.apache.accumulo.core.client.AccumuloSecurityException; -import org.apache.accumulo.core.client.BatchWriter; -import org.apache.accumulo.core.client.IteratorSetting; -import org.apache.accumulo.core.client.MultiTableBatchWriter; -import org.apache.accumulo.core.client.MutationsRejectedException; -import org.apache.accumulo.core.client.Scanner; -import org.apache.accumulo.core.client.TableExistsException; -import org.apache.accumulo.core.client.TableNotFoundException; -import 
org.apache.accumulo.core.client.admin.TableOperations; -import org.apache.accumulo.core.data.Key; -import org.apache.accumulo.core.data.Mutation; -import org.apache.accumulo.core.data.Range; -import org.apache.accumulo.core.data.Value; -import org.apache.accumulo.core.file.keyfunctor.ColumnFamilyFunctor; -import org.apache.accumulo.core.iterators.user.IntersectingIterator; -import org.apache.commons.lang.StringUtils; -import org.apache.commons.lang.Validate; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.io.Text; -import org.apache.log4j.Logger; -import org.openrdf.model.Literal; -import org.openrdf.model.Statement; -import org.openrdf.model.URI; -import org.openrdf.query.QueryEvaluationException; - -import com.google.common.base.Charsets; - -/** - * The {@link AccumuloFreeTextIndexer} stores and queries "free text" data from statements into tables in Accumulo. Specifically, this class - * stores data into two different Accumulo tables: the <b>document table</b> (default name: triplestore_text) and the <b>terms - * table</b> (default name: triplestore_terms). - * <p> - * The document table stores the document (i.e. a triple statement), document properties, and the terms within the document. This is the - * main table used for processing a text search, using document-partitioned indexing. See {@link IntersectingIterator}. - * <p> - * For each document, the document table will store the following information: - * <P> - * - * <pre> - * Row (partition) | Column Family | Column Qualifier | Value - * ================+================+==================+========== - * shardID | d\x00 | documentHash | Document - * shardID | s\x00Subject | documentHash | (empty) - * shardID | p\x00Predicate | documentHash | (empty) - * shardID | o\x00Object | documentHash | (empty) - * shardID | c\x00Context | documentHash | (empty) - * shardID | t\x00token | documentHash | (empty) - * </pre> - * <p> - * Note: documentHash is an MD5 hash (Base64 encoded) of the Document's Content - * <p> - * The terms table is used for expanding wildcard search terms. For each token in the document table, the terms table will store the following - * information: - * - * <pre> - * Row (partition) | CF/CQ/Value - * ==================+============= - * l\x00token | (empty) - * r\x00Reversetoken | (empty) - * </pre> - * <p> - * There are two prefixes in the table, "token list" (keys with an "l" prefix) and "reverse token list" (keys with an "r" prefix). This table - * uses the "token list" to expand foo* into terms like food, foot, and football. This table uses the "reverse token list" to expand *ar - * into car, bar, and far. - * <p> - * Example: Given these three statements as inputs: - * - * <pre> - * <uri:paul> rdfs:label "paul smith"@en <uri:graph1> - * <uri:steve> rdfs:label "steven anthony miller"@en <uri:graph1> - * <uri:steve> rdfs:label "steve miller"@en <uri:graph1> - * </pre> - * <p> - * Here's what the tables would look like: (Note: the hashes aren't real, the rows are not sorted, and the partition ids will vary.)
- * <p> - * triplestore_text - * - * <pre> - * Row (partition) | Column Family | Column Qualifier | Value - * ================+=================================+==================+========== - * 000000 | d\x00 | 08b3d233a | uri:graph1\x00uri:paul\x00rdfs:label\x00"paul smith"@en - * 000000 | s\x00uri:paul | 08b3d233a | (empty) - * 000000 | p\x00rdfs:label | 08b3d233a | (empty) - * 000000 | o\x00"paul smith"@en | 08b3d233a | (empty) - * 000000 | c\x00uri:graph1 | 08b3d233a | (empty) - * 000000 | t\x00paul | 08b3d233a | (empty) - * 000000 | t\x00smith | 08b3d233a | (empty) - * - * 000000 | d\x00 | 3a575534b | uri:graph1\x00uri:steve\x00rdfs:label\x00"steven anthony miller"@en - * 000000 | s\x00uri:steve | 3a575534b | (empty) - * 000000 | p\x00rdfs:label | 3a575534b | (empty) - * 000000 | o\x00"steven anthony miller"@en | 3a575534b | (empty) - * 000000 | c\x00uri:graph1 | 3a575534b | (empty) - * 000000 | t\x00steven | 3a575534b | (empty) - * 000000 | t\x00anthony | 3a575534b | (empty) - * 000000 | t\x00miller | 3a575534b | (empty) - * - * 000001 | d\x00 | 7bf670d06 | uri:graph1\x00uri:steve\x00rdfs:label\x00"steve miller"@en - * 000001 | s\x00uri:steve | 7bf670d06 | (empty) - * 000001 | p\x00rdfs:label | 7bf670d06 | (empty) - * 000001 | o\x00"steve miller"@en | 7bf670d06 | (empty) - * 000001 | c\x00uri:graph1 | 7bf670d06 | (empty) - * 000001 | t\x00steve | 7bf670d06 | (empty) - * 000001 | t\x00miller | 7bf670d06 | (empty) - * </pre> - * <p> - * triplestore_terms - * <p> - * - * <pre> - * Row (partition) | CF/CQ/Value - * ==================+============= - * l\x00paul | (empty) - * l\x00smith | (empty) - * l\x00steven | (empty) - * l\x00anthony | (empty) - * l\x00miller | (empty) - * l\x00steve | (empty) - * r\x00luap | (empty) - * r\x00htims | (empty) - * r\x00nevets | (empty) - * r\x00ynohtna | (empty) - * r\x00rellim | (empty) - * r\x00evets | (empty) - * - * </pre> - */ -public class AccumuloFreeTextIndexer extends AbstractAccumuloIndexer implements FreeTextIndexer { - private static final Logger logger = Logger.getLogger(AccumuloFreeTextIndexer.class); - - private static final byte[] EMPTY_BYTES = new byte[] {}; - private static final Text EMPTY_TEXT = new Text(EMPTY_BYTES); - private static final Value EMPTY_VALUE = new Value(EMPTY_BYTES); - - private Tokenizer tokenizer; - - private BatchWriter docTableBw; - private BatchWriter termTableBw; - private MultiTableBatchWriter mtbw; - - private int queryTermLimit; - - private int docTableNumPartitions; - - private Set<URI> validPredicates; - - private Configuration conf; - - private boolean isInit = false; - - - private void init() throws AccumuloException, AccumuloSecurityException, TableNotFoundException, - TableExistsException { - String doctable = ConfigUtils.getFreeTextDocTablename(conf); - String termtable = ConfigUtils.getFreeTextTermTablename(conf); - - docTableNumPartitions = ConfigUtils.getFreeTextDocNumPartitions(conf); - int termTableNumPartitions = ConfigUtils.getFreeTextTermNumPartitions(conf); - - TableOperations tableOps = ConfigUtils.getConnector(conf).tableOperations(); - - // Create term table partitions - boolean createdTermTable = ConfigUtils.createTableIfNotExists(conf, termtable); - if (createdTermTable && !ConfigUtils.useMockInstance(conf) && termTableNumPartitions > 0) { - TreeSet<Text> splits = new TreeSet<Text>(); - - // split on the "Term List" and "Reverse Term list" boundary - splits.add(new Text(ColumnPrefixes.getRevTermListColFam(""))); - - // Symmetrically split the "Term List" and "Reverse Term list" - int
numSubpartitions = ((termTableNumPartitions - 1) / 2); - if (numSubpartitions > 0) { - int step = (26 / numSubpartitions); - for (int i = 0; i < numSubpartitions; i++) { - String nextChar = String.valueOf((char) ('a' + (step * i))); - splits.add(new Text(ColumnPrefixes.getTermListColFam(nextChar))); - splits.add(new Text(ColumnPrefixes.getRevTermListColFam(nextChar))); - } - } - tableOps.addSplits(termtable, splits); - } - - // Create document (text) table partitions - boolean createdDocTable = ConfigUtils.createTableIfNotExists(conf, doctable); - if (createdDocTable && !ConfigUtils.useMockInstance(conf)) { - TreeSet<Text> splits = new TreeSet<Text>(); - for (int i = 0; i < docTableNumPartitions; i++) { - splits.add(genPartition(i, docTableNumPartitions)); - } - tableOps.addSplits(doctable, splits); - - // Add a tablet level Bloom filter for the Column Family. - // This will allow us to quickly determine if a term is contained in a tablet. - tableOps.setProperty(doctable, "table.bloom.key.functor", ColumnFamilyFunctor.class.getCanonicalName()); - tableOps.setProperty(doctable, "table.bloom.enabled", Boolean.TRUE.toString()); - } - - mtbw = ConfigUtils.createMultitableBatchWriter(conf); - - docTableBw = mtbw.getBatchWriter(doctable); - termTableBw = mtbw.getBatchWriter(termtable); - - tokenizer = ConfigUtils.getFreeTextTokenizer(conf); - validPredicates = ConfigUtils.getFreeTextPredicates(conf); - - queryTermLimit = ConfigUtils.getFreeTextTermLimit(conf); - } - - - //initialization occurs in setConf because index is created using reflection - @Override - public void setConf(Configuration conf) { - this.conf = conf; - if (!isInit) { - try { - init(); - isInit = true; - } catch (AccumuloException e) { - logger.warn("Unable to initialize index. Throwing Runtime Exception. ", e); - throw new RuntimeException(e); - } catch (AccumuloSecurityException e) { - logger.warn("Unable to initialize index. Throwing Runtime Exception. ", e); - throw new RuntimeException(e); - } catch (TableNotFoundException e) { - logger.warn("Unable to initialize index. Throwing Runtime Exception. ", e); - throw new RuntimeException(e); - } catch (TableExistsException e) { - logger.warn("Unable to initialize index. Throwing Runtime Exception. ", e); - throw new RuntimeException(e); - } - } - } - - @Override - public Configuration getConf() { - return this.conf; - } - - - private void storeStatement(Statement statement) throws IOException { - // if the predicate list is empty, accept all predicates. 
- // Otherwise, make sure the predicate is on the "valid" list - boolean isValidPredicate = validPredicates.isEmpty() || validPredicates.contains(statement.getPredicate()); - - if (isValidPredicate && (statement.getObject() instanceof Literal)) { - - // Get the tokens - String text = statement.getObject().stringValue().toLowerCase(); - SortedSet<String> tokens = tokenizer.tokenize(text); - - if (!tokens.isEmpty()) { - // Get Document Data - String docContent = StatementSerializer.writeStatement(statement); - - String docId = Md5Hash.md5Base64(docContent); - - // Set up the partition - Text partition = genPartition(docContent.hashCode(), docTableNumPartitions); - - Mutation docTableMut = new Mutation(partition); - List<Mutation> termTableMutations = new ArrayList<Mutation>(); - - Text docIdText = new Text(docId); - - // Store the Document Data - docTableMut.put(ColumnPrefixes.DOCS_CF_PREFIX, docIdText, new Value(docContent.getBytes(Charsets.UTF_8))); - - // index the statement parts - docTableMut.put(ColumnPrefixes.getSubjColFam(statement), docIdText, EMPTY_VALUE); - docTableMut.put(ColumnPrefixes.getPredColFam(statement), docIdText, EMPTY_VALUE); - docTableMut.put(ColumnPrefixes.getObjColFam(statement), docIdText, EMPTY_VALUE); - docTableMut.put(ColumnPrefixes.getContextColFam(statement), docIdText, EMPTY_VALUE); - - // index the statement terms - for (String token : tokens) { - // tie the token to the document - docTableMut.put(ColumnPrefixes.getTermColFam(token), docIdText, EMPTY_VALUE); - - // store the term in the term table (useful for wildcard searches) - termTableMutations.add(createEmptyPutMutation(ColumnPrefixes.getTermListColFam(token))); - termTableMutations.add(createEmptyPutMutation(ColumnPrefixes.getRevTermListColFam(token))); - } - - // write the mutations - try { - docTableBw.addMutation(docTableMut); - termTableBw.addMutations(termTableMutations); - } catch (MutationsRejectedException e) { - logger.error("error adding mutation", e); - throw new IOException(e); - } - - } - - } - } - - @Override - public void storeStatement(RyaStatement statement) throws IOException { - storeStatement(RyaToRdfConversions.convertStatement(statement)); - } - - private static Mutation createEmptyPutMutation(Text row) { - Mutation m = new Mutation(row); - m.put(EMPTY_TEXT, EMPTY_TEXT, EMPTY_VALUE); - return m; - } - - private static Text genPartition(int partition, int numPartitions) { - int length = Integer.toString(numPartitions).length(); - return new Text(String.format("%0" + length + "d", Math.abs(partition % numPartitions))); - } - - @Override - public Set<URI> getIndexablePredicates() { - return validPredicates; - } - - /** {@inheritDoc} */ - @Override - public void flush() throws IOException { - try { - mtbw.flush(); - } catch (MutationsRejectedException e) { - logger.error("error flushing the batch writer", e); - throw new IOException(e); - } - } - - /** {@inheritDoc} */ - @Override - public void close() throws IOException { - try { - mtbw.close(); - } catch (MutationsRejectedException e) { - logger.error("error closing the batch writer", e); - throw new IOException(e); - } - } - - private Set<String> unrollWildcard(String string, boolean reverse) throws IOException { - Scanner termTableScan = getScanner(ConfigUtils.getFreeTextTermTablename(conf)); - - Set<String> unrolledTerms = new HashSet<String>(); - - Text queryTerm; - if (reverse) { - String t = StringUtils.removeStart(string, "*").toLowerCase(); - queryTerm = ColumnPrefixes.getRevTermListColFam(t); - } else { - String t = 
StringUtils.removeEnd(string, "*").toLowerCase(); - queryTerm = ColumnPrefixes.getTermListColFam(t); - } - - // perform query and read results - termTableScan.setRange(Range.prefix(queryTerm)); - - for (Entry<Key, Value> e : termTableScan) { - String term = ColumnPrefixes.removePrefix(e.getKey().getRow()).toString(); - if (reverse) { - unrolledTerms.add(StringUtils.reverse(term)); - } else { - unrolledTerms.add(term); - } - } - - if (unrolledTerms.isEmpty()) { - // put in a placeholder term that will never be in the index. - unrolledTerms.add("\1\1\1"); - } - - return unrolledTerms; - } - - private void unrollWildcards(SimpleNode node) throws IOException { - if (node instanceof ASTExpression || node instanceof ASTSimpleNode) { - for (SimpleNode n : getNodeIterator(node)) { - unrollWildcards(n); - } - } else if (node instanceof ASTTerm) { - ASTTerm term = (ASTTerm) node; - boolean isWildTerm = term.getType().equals(ASTTerm.WILDTERM); - boolean isPreWildTerm = term.getType().equals(ASTTerm.PREFIXTERM); - if (isWildTerm || isPreWildTerm) { - Set<String> unrolledTerms = unrollWildcard(term.getTerm(), isPreWildTerm); - - // create a new expression - ASTExpression newExpression = new ASTExpression(QueryParserTreeConstants.JJTEXPRESSION); - newExpression.setType(ASTExpression.OR); - newExpression.setNotFlag(term.isNotFlag()); - - for (String unrolledTerm : unrolledTerms) { - ASTTerm t = new ASTTerm(QueryParserTreeConstants.JJTTERM); - t.setNotFlag(false); - t.setTerm(unrolledTerm); - t.setType(ASTTerm.TERM); - ASTNodeUtils.pushChild(newExpression, t); - } - - // replace "term" node with "expression" node in "term" node parent - SimpleNode parent = (SimpleNode) term.jjtGetParent(); - int index = ASTNodeUtils.getChildIndex(parent, term); - - Validate.isTrue(index >= 0, "child not found in parent"); - - parent.jjtAddChild(newExpression, index); - } - - } else { - throw new IllegalArgumentException("Node is of unknown type: " + node.getClass().getName()); - } - } - - private Scanner getScanner(String tablename) throws IOException { - try { - return ConfigUtils.createScanner(tablename, conf); - } catch (AccumuloException e) { - logger.error("Error connecting to " + tablename); - throw new IOException(e); - } catch (AccumuloSecurityException e) { - logger.error("Error connecting to " + tablename); - throw new IOException(e); - } catch (TableNotFoundException e) { - logger.error("Error connecting to " + tablename); - throw new IOException(e); - } - } - - /** {@inheritDoc} */ - @Override - public CloseableIteration<Statement, QueryEvaluationException> queryText(String query, StatementContraints contraints) - throws IOException { - Scanner docTableScan = getScanner(ConfigUtils.getFreeTextDocTablename(conf)); - - // test the query to see if it parses correctly.
- SimpleNode root = parseQuery(query); - - // unroll any wildcard nodes before it goes to the server - unrollWildcards(root); - - String unrolledQuery = ASTNodeUtils.serializeExpression(root); - - // Add S P O C constraints to query - StringBuilder constrainedQuery = new StringBuilder("(" + unrolledQuery + ")"); - - if (contraints.hasSubject()) { - constrainedQuery.append(" AND "); - constrainedQuery.append(ColumnPrefixes.getSubjColFam(contraints.getSubject().toString()).toString()); - } - if (contraints.hasContext()) { - constrainedQuery.append(" AND "); - constrainedQuery.append(ColumnPrefixes.getContextColFam(contraints.getContext().toString()).toString()); - } - if (contraints.hasPredicates()) { - constrainedQuery.append(" AND ("); - List<String> predicates = new ArrayList<String>(); - for (URI u : contraints.getPredicates()) { - predicates.add(ColumnPrefixes.getPredColFam(u.stringValue()).toString()); - } - constrainedQuery.append(StringUtils.join(predicates, " OR ")); - constrainedQuery.append(")"); - } - - // Verify that the query is a reasonable size - root = parseQuery(constrainedQuery.toString()); - int termCount = ASTNodeUtils.termCount(root); - - if (termCount > queryTermLimit) { - throw new IOException("Query contains too many terms. Term limit: " + queryTermLimit + ". Term Count: " + termCount); - } - - // perform query - docTableScan.clearScanIterators(); - docTableScan.clearColumns(); - - int iteratorPriority = 20; - String iteratorName = "booleanTree"; - IteratorSetting ii = new IteratorSetting(iteratorPriority, iteratorName, BooleanTreeIterator.class); - BooleanTreeIterator.setQuery(ii, constrainedQuery.toString()); - docTableScan.addScanIterator(ii); - docTableScan.setRange(new Range()); - - return getIteratorWrapper(docTableScan); - } - - private static CloseableIteration<Statement, QueryEvaluationException> getIteratorWrapper(final Scanner s) { - - final Iterator<Entry<Key, Value>> i = s.iterator(); - - return new CloseableIteration<Statement, QueryEvaluationException>() { - @Override - public boolean hasNext() { - return i.hasNext(); - } - - @Override - public Statement next() throws QueryEvaluationException { - Entry<Key, Value> entry = i.next(); - Value v = entry.getValue(); - try { - String dataString = Text.decode(v.get(), 0, v.getSize()); - Statement s = StatementSerializer.readStatement(dataString); - return s; - } catch (CharacterCodingException e) { - logger.error("Error decoding value", e); - throw new QueryEvaluationException(e); - } catch (IOException e) { - logger.error("Error deserializing statement", e); - throw new QueryEvaluationException(e); - } - } - - @Override - public void remove() { - throw new UnsupportedOperationException("Remove not implemented"); - } - - @Override - public void close() throws QueryEvaluationException { - s.close(); - } - }; - } - - /** - * Simple adapter that parses the query using {@link QueryParser}. Note: any checked exceptions thrown by {@link QueryParser} are - * re-thrown as {@link IOException}s. - * - * @param query - * @return - * @throws IOException - */ - private static SimpleNode parseQuery(String query) throws IOException { - SimpleNode root = null; - try { - root = QueryParser.parse(query); - } catch (ParseException e) { - logger.error("Parser Exception on Client Side. Query: " + query, e); - throw new IOException(e); - } catch (TokenMgrError e) { - logger.error("Token Manager Exception on Client Side. 
Query: " + query, e); - throw new IOException(e); - } - return root; - } - - - @Override - public String getTableName() { - return ConfigUtils.getFreeTextDocTablename(conf); - } - - -}
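The Javadoc above explains how wildcard terms are expanded against the terms table: a prefix query such as foo* scans rows under the "l\x00" (token list) prefix, while a suffix query such as *ar reverses the stem and scans rows under the "r\x00" (reverse token list) prefix, un-reversing each hit — which is exactly what unrollWildcard() does with Range.prefix. Below is a minimal, self-contained sketch of that lookup, with an in-memory TreeSet standing in for the Accumulo scan over the terms table; the class and method names here are illustrative only, not part of Rya.

    import java.util.SortedSet;
    import java.util.TreeSet;

    /** Illustrative stand-in for the terms-table wildcard expansion (not part of Rya). */
    public class WildcardExpansionSketch {
        // "l\0" and "r\0" column prefixes, as documented in the indexer's Javadoc.
        private static final String TERM_LIST = "l\0";
        private static final String REV_TERM_LIST = "r\0";

        // Sorted rows, emulating the terms table.
        private final TreeSet<String> termTable = new TreeSet<String>();

        void index(String token) {
            termTable.add(TERM_LIST + token);
            termTable.add(REV_TERM_LIST + new StringBuilder(token).reverse().toString());
        }

        /** Expands "foo*" via the token list, or "*ar" via the reverse token list. */
        SortedSet<String> expand(String pattern) {
            boolean suffixQuery = pattern.startsWith("*");
            String stem = pattern.replace("*", "").toLowerCase();
            String rowPrefix = suffixQuery
                    ? REV_TERM_LIST + new StringBuilder(stem).reverse().toString()
                    : TERM_LIST + stem;
            SortedSet<String> hits = new TreeSet<String>();
            // Equivalent of Range.prefix(queryTerm): walk all rows starting with rowPrefix.
            for (String row : termTable.tailSet(rowPrefix)) {
                if (!row.startsWith(rowPrefix)) {
                    break;
                }
                String term = row.substring(2); // strip the two-byte column prefix
                hits.add(suffixQuery ? new StringBuilder(term).reverse().toString() : term);
            }
            return hits;
        }

        public static void main(String[] args) {
            WildcardExpansionSketch sketch = new WildcardExpansionSketch();
            for (String t : new String[] { "paul", "smith", "steve", "steven", "miller", "anthony" }) {
                sketch.index(t);
            }
            System.out.println(sketch.expand("ste*")); // [steve, steven]
            System.out.println(sketch.expand("*er"));  // [miller]
        }
    }

In the real indexer, a lookup that matches nothing falls back to the placeholder term ("\1\1\1") that can never appear in the index, so the downstream intersecting iterator still receives a well-formed query.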
http://git-wip-us.apache.org/repos/asf/incubator-rya/blob/5a03ef61/extras/indexing/src/main/java/mvm/rya/indexing/accumulo/freetext/ColumnPrefixes.java ---------------------------------------------------------------------- diff --git a/extras/indexing/src/main/java/mvm/rya/indexing/accumulo/freetext/ColumnPrefixes.java b/extras/indexing/src/main/java/mvm/rya/indexing/accumulo/freetext/ColumnPrefixes.java deleted file mode 100644 index 31666c9..0000000 --- a/extras/indexing/src/main/java/mvm/rya/indexing/accumulo/freetext/ColumnPrefixes.java +++ /dev/null @@ -1,120 +0,0 @@ -package mvm.rya.indexing.accumulo.freetext; - -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - - - -import java.nio.ByteBuffer; -import java.nio.charset.CharacterCodingException; - -import mvm.rya.indexing.accumulo.StatementSerializer; - -import org.apache.commons.lang.StringUtils; -import org.apache.hadoop.io.Text; -import org.openrdf.model.Statement; - -/** - * Row ID: shardId - * <p> - * CF: CF Prefix + Term - */ -public class ColumnPrefixes { - public static final Text DOCS_CF_PREFIX = new Text("d\0"); - public static final Text TERM_CF_PREFIX = new Text("t\0"); - public static final Text TERM_LIST_CF_PREFIX = new Text("l\0"); - public static final Text REVERSE_TERM_LIST_CF_PREFIX = new Text("r\0"); - - public static final Text SUBJECT_CF_PREFIX = new Text("s\0"); - public static final Text PREDICATE_CF_PREFIX = new Text("p\0"); - public static final Text OBJECT_CF_PREFIX = new Text("o\0"); - public static final Text CONTEXT_CF_PREFIX = new Text("c\0"); - - private static Text concat(Text prefix, String str) { - Text temp = new Text(prefix); - - try { - ByteBuffer buffer = Text.encode(str, false); - temp.append(buffer.array(), 0, buffer.limit()); - } catch (CharacterCodingException cce) { - throw new IllegalArgumentException(cce); - } - - return temp; - } - - public static Text getTermColFam(String term) { - return concat(TERM_CF_PREFIX, term); - } - - public static Text getTermListColFam(String term) { - return concat(TERM_LIST_CF_PREFIX, term); - } - - public static Text getRevTermListColFam(String term) { - return concat(REVERSE_TERM_LIST_CF_PREFIX, StringUtils.reverse(term)); - } - - public static Text getDocColFam(String term) { - return concat(DOCS_CF_PREFIX, term); - } - - public static Text getSubjColFam(String term) { - return concat(SUBJECT_CF_PREFIX, term); - } - - public static Text getSubjColFam(Statement statement) { - String subj = StatementSerializer.writeSubject(statement); - return getSubjColFam(subj); - } - - public static Text getPredColFam(String term) { - return concat(PREDICATE_CF_PREFIX, term); - } - - public static Text getPredColFam(Statement statement) { - String pred = StatementSerializer.writePredicate(statement); - return 
getPredColFam(pred); - } - - public static Text getObjColFam(String term) { - return concat(OBJECT_CF_PREFIX, term); - } - - public static Text getObjColFam(Statement statement) { - String obj = StatementSerializer.writeObject(statement); - return getObjColFam(obj); - } - - public static Text getContextColFam(String term) { - return concat(CONTEXT_CF_PREFIX, term); - } - - public static Text getContextColFam(Statement statement) { - String cont = StatementSerializer.writeContext(statement); - return getContextColFam(cont); - } - - public static Text removePrefix(Text termWithPrefix) { - Text temp = new Text(); - temp.set(termWithPrefix.getBytes(), 2, termWithPrefix.getLength() - 2); - return temp; - } - -} http://git-wip-us.apache.org/repos/asf/incubator-rya/blob/5a03ef61/extras/indexing/src/main/java/mvm/rya/indexing/accumulo/freetext/FreeTextTupleSet.java ---------------------------------------------------------------------- diff --git a/extras/indexing/src/main/java/mvm/rya/indexing/accumulo/freetext/FreeTextTupleSet.java b/extras/indexing/src/main/java/mvm/rya/indexing/accumulo/freetext/FreeTextTupleSet.java deleted file mode 100644 index 471870b..0000000 --- a/extras/indexing/src/main/java/mvm/rya/indexing/accumulo/freetext/FreeTextTupleSet.java +++ /dev/null @@ -1,160 +0,0 @@ -package mvm.rya.indexing.accumulo.freetext; - -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - - -import info.aduna.iteration.CloseableIteration; - -import java.io.IOException; -import java.util.Set; - -import mvm.rya.indexing.FreeTextIndexer; -import mvm.rya.indexing.IndexingExpr; -import mvm.rya.indexing.IteratorFactory; -import mvm.rya.indexing.SearchFunction; -import mvm.rya.indexing.StatementContraints; -import mvm.rya.indexing.external.tupleSet.ExternalTupleSet; - -import org.apache.hadoop.conf.Configuration; -import org.openrdf.model.Statement; -import org.openrdf.model.URI; -import org.openrdf.query.BindingSet; -import org.openrdf.query.QueryEvaluationException; -import org.openrdf.query.algebra.QueryModelVisitor; - -import com.google.common.base.Joiner; - - -//Indexing Node for freetext expressions to be inserted into execution plan -//to delegate freetext portion of query to free text index -public class FreeTextTupleSet extends ExternalTupleSet { - - private Configuration conf; - private FreeTextIndexer freeTextIndexer; - private IndexingExpr filterInfo; - - - public FreeTextTupleSet(IndexingExpr filterInfo, FreeTextIndexer freeTextIndexer) { - this.filterInfo = filterInfo; - this.freeTextIndexer = freeTextIndexer; - this.conf = freeTextIndexer.getConf(); - } - - /** - * {@inheritDoc} - */ - @Override - public Set<String> getBindingNames() { - return filterInfo.getBindingNames(); - } - - /** - * {@inheritDoc} - * <p> - * Note that we need a deep copy for everything that (during optimizations) - * can be altered via {@link #visitChildren(QueryModelVisitor)} - */ - public FreeTextTupleSet clone() { - return new FreeTextTupleSet(filterInfo, freeTextIndexer); - } - - @Override - public double cardinality() { - return 0.0; // No idea how to estimate the cardinality here. - } - - - - - @Override - public String getSignature() { - - return "(FreeTextTuple Projection) " + "variables: " + Joiner.on(", ").join(this.getBindingNames()).replaceAll("\\s+", " "); - } - - - - @Override - public boolean equals(Object other) { - if (other == this) { - return true; - } - if (!(other instanceof FreeTextTupleSet)) { - return false; - } - - FreeTextTupleSet arg = (FreeTextTupleSet) other; - return this.filterInfo.equals(arg.filterInfo); - } - - - @Override - public int hashCode() { - int result = 17; - result = 31*result + filterInfo.hashCode(); - - return result; - } - - - - /** - * Returns an iterator over the result set of the contained {@link IndexingExpr}. - * <p> - * Should be thread-safe (concurrent invocation of this - * method can be expected with some query evaluators).
- */ - @Override - public CloseableIteration<BindingSet, QueryEvaluationException> evaluate(BindingSet bindings) - throws QueryEvaluationException { - - - URI funcURI = filterInfo.getFunction(); - - SearchFunction searchFunction = new SearchFunction() { - - @Override - public CloseableIteration<Statement, QueryEvaluationException> performSearch(String queryText, - StatementContraints contraints) throws QueryEvaluationException { - try { - CloseableIteration<Statement, QueryEvaluationException> statements = freeTextIndexer.queryText( - queryText, contraints); - return statements; - } catch (IOException e) { - throw new QueryEvaluationException(e); - } - } - - @Override - public String toString() { - return "TEXT"; - }; - }; - - if (filterInfo.getArguments().length > 1) { - throw new IllegalArgumentException("Index functions do not support more than one argument."); - } - - String queryText = filterInfo.getArguments()[0].stringValue(); - - return IteratorFactory.getIterator(filterInfo.getSpConstraint(), bindings, queryText, searchFunction); - } - -} http://git-wip-us.apache.org/repos/asf/incubator-rya/blob/5a03ef61/extras/indexing/src/main/java/mvm/rya/indexing/accumulo/freetext/LuceneTokenizer.java ---------------------------------------------------------------------- diff --git a/extras/indexing/src/main/java/mvm/rya/indexing/accumulo/freetext/LuceneTokenizer.java b/extras/indexing/src/main/java/mvm/rya/indexing/accumulo/freetext/LuceneTokenizer.java deleted file mode 100644 index abda04a..0000000 --- a/extras/indexing/src/main/java/mvm/rya/indexing/accumulo/freetext/LuceneTokenizer.java +++ /dev/null @@ -1,57 +0,0 @@ -package mvm.rya.indexing.accumulo.freetext; - -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - - - -import java.io.IOException; -import java.io.StringReader; -import java.util.SortedSet; -import java.util.TreeSet; - -import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.standard.StandardAnalyzer; -import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; -import org.apache.lucene.util.Version; - -/** - * A {@link Tokenizer} that delegates to Lucene functions - */ -public class LuceneTokenizer implements Tokenizer { - private static final Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_36); - - @Override - public SortedSet<String> tokenize(String string) { - SortedSet<String> set = new TreeSet<String>(); - try { - TokenStream stream = analyzer.tokenStream(null, new StringReader(string)); - stream.reset(); - while (stream.incrementToken()) { - set.add(stream.getAttribute(CharTermAttribute.class).toString()); - } - } catch (IOException e) { - // not thrown b/c we're using a string reader...
- throw new RuntimeException(e); - } - - return set; - } -} http://git-wip-us.apache.org/repos/asf/incubator-rya/blob/5a03ef61/extras/indexing/src/main/java/mvm/rya/indexing/accumulo/freetext/SimpleTokenizer.java ---------------------------------------------------------------------- diff --git a/extras/indexing/src/main/java/mvm/rya/indexing/accumulo/freetext/SimpleTokenizer.java b/extras/indexing/src/main/java/mvm/rya/indexing/accumulo/freetext/SimpleTokenizer.java deleted file mode 100644 index e98e676..0000000 --- a/extras/indexing/src/main/java/mvm/rya/indexing/accumulo/freetext/SimpleTokenizer.java +++ /dev/null @@ -1,43 +0,0 @@ -package mvm.rya.indexing.accumulo.freetext; - -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - - - -import java.util.SortedSet; -import java.util.TreeSet; - -/** - * A {@link Tokenizer} that splits on whitespace. - */ -public class SimpleTokenizer implements Tokenizer { - - @Override - public SortedSet<String> tokenize(String string) { - SortedSet<String> set = new TreeSet<String>(); - for (String token : string.split("\\s+")) { - String t = token.trim().toLowerCase(); - if (!t.isEmpty()) { - set.add(t); - } - } - return set; - } -} http://git-wip-us.apache.org/repos/asf/incubator-rya/blob/5a03ef61/extras/indexing/src/main/java/mvm/rya/indexing/accumulo/freetext/Tokenizer.java ---------------------------------------------------------------------- diff --git a/extras/indexing/src/main/java/mvm/rya/indexing/accumulo/freetext/Tokenizer.java b/extras/indexing/src/main/java/mvm/rya/indexing/accumulo/freetext/Tokenizer.java deleted file mode 100644 index 24b40cd..0000000 --- a/extras/indexing/src/main/java/mvm/rya/indexing/accumulo/freetext/Tokenizer.java +++ /dev/null @@ -1,31 +0,0 @@ -package mvm.rya.indexing.accumulo.freetext; - -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - - - -import java.util.SortedSet; - -/** - * A utility that splits a string into tokens.
- */ -public interface Tokenizer { - public SortedSet<String> tokenize(String string); -} http://git-wip-us.apache.org/repos/asf/incubator-rya/blob/5a03ef61/extras/indexing/src/main/java/mvm/rya/indexing/accumulo/freetext/iterators/AndingIterator.java ---------------------------------------------------------------------- diff --git a/extras/indexing/src/main/java/mvm/rya/indexing/accumulo/freetext/iterators/AndingIterator.java b/extras/indexing/src/main/java/mvm/rya/indexing/accumulo/freetext/iterators/AndingIterator.java deleted file mode 100644 index 355fe14..0000000 --- a/extras/indexing/src/main/java/mvm/rya/indexing/accumulo/freetext/iterators/AndingIterator.java +++ /dev/null @@ -1,563 +0,0 @@ -package mvm.rya.indexing.accumulo.freetext.iterators; - -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - - - -import java.io.IOException; -import java.util.Collection; -import java.util.Collections; -import java.util.Map; - -import org.apache.accumulo.core.client.IteratorSetting; -import org.apache.accumulo.core.data.ArrayByteSequence; -import org.apache.accumulo.core.data.ByteSequence; -import org.apache.accumulo.core.data.Key; -import org.apache.accumulo.core.data.PartialKey; -import org.apache.accumulo.core.data.Range; -import org.apache.accumulo.core.data.Value; -import org.apache.accumulo.core.iterators.IteratorEnvironment; -import org.apache.accumulo.core.iterators.SortedKeyValueIterator; -import org.apache.accumulo.core.iterators.user.IntersectingIterator; -import org.apache.accumulo.core.util.TextUtil; -import org.apache.commons.codec.binary.Base64; -import org.apache.hadoop.io.Text; -import org.apache.log4j.Logger; - -/** - * Adapted from {@link IntersectingIterator} with very slight modifications. Specifically, the comparator on the TermSource internal class was - * modified to handle exhausted iterators and multiple rows per tablet server. - */ -public class AndingIterator implements SortedKeyValueIterator<Key, Value> { - - protected Text nullText = new Text(); - - protected Text getPartition(Key key) { - return key.getRow(); - } - - protected Text getTerm(Key key) { - return key.getColumnFamily(); - } - - protected Text getDocID(Key key) { - return key.getColumnQualifier(); - } - - protected Key buildKey(Text partition, Text term) { - return new Key(partition, (term == null) ? nullText : term); - } - - protected Key buildKey(Text partition, Text term, Text docID) { - return new Key(partition, (term == null) ? 
nullText : term, docID); - } - - protected Key buildFollowingPartitionKey(Key key) { - return key.followingKey(PartialKey.ROW); - } - - protected static final Logger log = Logger.getLogger(AndingIterator.class); - - protected static class TermSource { - public SortedKeyValueIterator<Key, Value> iter; - public Text term; - public Collection<ByteSequence> seekColfams; - public boolean notFlag; - - public TermSource(TermSource other) { - this.iter = other.iter; - this.term = other.term; - this.notFlag = other.notFlag; - this.seekColfams = other.seekColfams; - } - - public TermSource(SortedKeyValueIterator<Key, Value> iter, Text term) { - this(iter, term, false); - } - - public TermSource(SortedKeyValueIterator<Key, Value> iter, Text term, boolean notFlag) { - this.iter = iter; - this.term = term; - this.notFlag = notFlag; - // The desired column families for this source is the term itself - - // handle the case where the term is null. - if (term == null) { - this.seekColfams = Collections.<ByteSequence> emptyList(); - } else { - this.seekColfams = Collections.<ByteSequence> singletonList(new ArrayByteSequence(term.getBytes(), 0, term.getLength())); - } - } - - public String getTermString() { - return (this.term == null) ? new String("Iterator") : this.term.toString(); - } - } - - TermSource[] sources; - int sourcesCount = 0; - - Range overallRange; - - // query-time settings - protected Text currentPartition = null; - protected Text currentDocID = new Text(emptyByteArray); - static final byte[] emptyByteArray = new byte[0]; - - protected Key topKey = null; - protected Value value = new Value(emptyByteArray); - - public AndingIterator() { - } - - @Override - public SortedKeyValueIterator<Key, Value> deepCopy(IteratorEnvironment env) { - return new AndingIterator(this, env); - } - - private AndingIterator(AndingIterator other, IteratorEnvironment env) { - if (other.sources != null) { - sourcesCount = other.sourcesCount; - sources = new TermSource[sourcesCount]; - for (int i = 0; i < sourcesCount; i++) { - sources[i] = new TermSource(other.sources[i].iter.deepCopy(env), other.sources[i].term); - } - } - } - - @Override - public Key getTopKey() { - return topKey; - } - - @Override - public Value getTopValue() { - // we don't really care about values - return value; - } - - @Override - public boolean hasTop() { - return currentPartition != null; - } - - // precondition: currentRow is not null - private boolean seekOneSource(int sourceID) throws IOException { - // find the next key in the appropriate column family that is at or beyond the cursor (currentRow, currentCQ) - // advance the cursor if this source goes beyond it - // return whether we advanced the cursor - - // within this loop progress must be made in one of the following forms: - // - currentRow or currentCQ must be increased - // - the given source must advance its iterator - // this loop will end when any of the following criteria are met - // - the iterator for the given source is pointing to the key (currentRow, columnFamilies[sourceID], currentCQ) - // - the given source is out of data and currentRow is set to null - // - the given source has advanced beyond the endRow and currentRow is set to null - boolean advancedCursor = false; - - if (sources[sourceID].notFlag) { - while (true) { - if (sources[sourceID].iter.hasTop() == false) { - // an empty column that you are negating is a valid condition - break; - } - // check if we're past the end key - int endCompare = -1; - // we should compare the row to the end of the range - if 
(overallRange.getEndKey() != null) { - endCompare = overallRange.getEndKey().getRow().compareTo(sources[sourceID].iter.getTopKey().getRow()); - if ((!overallRange.isEndKeyInclusive() && endCompare <= 0) || endCompare < 0) { - // an empty column that you are negating is a valid condition - break; - } - } - int partitionCompare = currentPartition.compareTo(getPartition(sources[sourceID].iter.getTopKey())); - // check if this source is already at or beyond currentRow - // if not, then seek to at least the current row - - if (partitionCompare > 0) { - // seek to at least the currentRow - Key seekKey = buildKey(currentPartition, sources[sourceID].term); - sources[sourceID].iter.seek(new Range(seekKey, true, null, false), sources[sourceID].seekColfams, true); - continue; - } - // check if this source has gone beyond currentRow - // if so, this is a valid condition for negation - if (partitionCompare < 0) { - break; - } - // we have verified that the current source is positioned in currentRow - // now we must make sure we're in the right columnFamily in the current row - // Note: Iterators are auto-magically set to the correct columnFamily - if (sources[sourceID].term != null) { - int termCompare = sources[sourceID].term.compareTo(getTerm(sources[sourceID].iter.getTopKey())); - // check if this source is already on the right columnFamily - // if not, then seek forwards to the right columnFamily - if (termCompare > 0) { - Key seekKey = buildKey(currentPartition, sources[sourceID].term, currentDocID); - sources[sourceID].iter.seek(new Range(seekKey, true, null, false), sources[sourceID].seekColfams, true); - continue; - } - // check if this source is beyond the right columnFamily - // if so, then this is a valid condition for negating - if (termCompare < 0) { - break; - } - } - - // we have verified that we are in currentRow and the correct column family - // make sure we are at or beyond columnQualifier - Text docID = getDocID(sources[sourceID].iter.getTopKey()); - int docIDCompare = currentDocID.compareTo(docID); - // If we are past the target, this is a valid result - if (docIDCompare < 0) { - break; - } - // if this source is not yet at the currentCQ then advance in this source - if (docIDCompare > 0) { - // seek forwards - Key seekKey = buildKey(currentPartition, sources[sourceID].term, currentDocID); - sources[sourceID].iter.seek(new Range(seekKey, true, null, false), sources[sourceID].seekColfams, true); - continue; - } - // if we are equal to the target, this is an invalid result. - // Force the entire process to go to the next row. - // We are advancing column 0 because we forced that column to not contain a ! 
- // when we did the init() - if (docIDCompare == 0) { - sources[0].iter.next(); - advancedCursor = true; - break; - } - } - } else { - while (true) { - if (sources[sourceID].iter.hasTop() == false) { - currentPartition = null; - // setting currentRow to null counts as advancing the cursor - return true; - } - // check if we're past the end key - int endCompare = -1; - // we should compare the row to the end of the range - - if (overallRange.getEndKey() != null) { - endCompare = overallRange.getEndKey().getRow().compareTo(sources[sourceID].iter.getTopKey().getRow()); - if ((!overallRange.isEndKeyInclusive() && endCompare <= 0) || endCompare < 0) { - currentPartition = null; - // setting currentRow to null counts as advancing the cursor - return true; - } - } - int partitionCompare = currentPartition.compareTo(getPartition(sources[sourceID].iter.getTopKey())); - // check if this source is already at or beyond currentRow - // if not, then seek to at least the current row - if (partitionCompare > 0) { - // seek to at least the currentRow - Key seekKey = buildKey(currentPartition, sources[sourceID].term); - sources[sourceID].iter.seek(new Range(seekKey, true, null, false), sources[sourceID].seekColfams, true); - continue; - } - // check if this source has gone beyond currentRow - // if so, advance currentRow - if (partitionCompare < 0) { - currentPartition.set(getPartition(sources[sourceID].iter.getTopKey())); - currentDocID.set(emptyByteArray); - advancedCursor = true; - continue; - } - // we have verified that the current source is positioned in currentRow - // now we must make sure we're in the right columnFamily in the current row - // Note: Iterators are auto-magically set to the correct columnFamily - - if (sources[sourceID].term != null) { - int termCompare = sources[sourceID].term.compareTo(getTerm(sources[sourceID].iter.getTopKey())); - // check if this source is already on the right columnFamily - // if not, then seek forwards to the right columnFamily - if (termCompare > 0) { - Key seekKey = buildKey(currentPartition, sources[sourceID].term, currentDocID); - sources[sourceID].iter.seek(new Range(seekKey, true, null, false), sources[sourceID].seekColfams, true); - continue; - } - // check if this source is beyond the right columnFamily - // if so, then seek to the next row - if (termCompare < 0) { - // we're out of entries in the current row, so seek to the next one - // byte[] currentRowBytes = currentRow.getBytes(); - // byte[] nextRow = new byte[currentRowBytes.length + 1]; - // System.arraycopy(currentRowBytes, 0, nextRow, 0, currentRowBytes.length); - // nextRow[currentRowBytes.length] = (byte)0; - // // we should reuse text objects here - // sources[sourceID].seek(new Key(new Text(nextRow),columnFamilies[sourceID])); - if (endCompare == 0) { - // we're done - currentPartition = null; - // setting currentRow to null counts as advancing the cursor - return true; - } - Key seekKey = buildFollowingPartitionKey(sources[sourceID].iter.getTopKey()); - sources[sourceID].iter.seek(new Range(seekKey, true, null, false), sources[sourceID].seekColfams, true); - continue; - } - } - // we have verified that we are in currentRow and the correct column family - // make sure we are at or beyond columnQualifier - Text docID = getDocID(sources[sourceID].iter.getTopKey()); - int docIDCompare = currentDocID.compareTo(docID); - // if this source has advanced beyond the current column qualifier then advance currentCQ and return true - if (docIDCompare < 0) { - currentDocID.set(docID); - 
advancedCursor = true; - break; - } - // if this source is not yet at the currentCQ then seek in this source - if (docIDCompare > 0) { - // seek forwards - Key seekKey = buildKey(currentPartition, sources[sourceID].term, currentDocID); - sources[sourceID].iter.seek(new Range(seekKey, true, null, false), sources[sourceID].seekColfams, true); - continue; - } - // this source is at the current row, in its column family, and at currentCQ - break; - } - } - return advancedCursor; - } - - @Override - public void next() throws IOException { - if (currentPartition == null) { - return; - } - // precondition: the current row is set up and the sources all have the same column qualifier - // while we don't have a match, seek in the source with the smallest column qualifier - sources[0].iter.next(); - advanceToIntersection(); - } - - protected void advanceToIntersection() throws IOException { - boolean cursorChanged = true; - while (cursorChanged) { - // seek all of the sources to at least the highest seen column qualifier in the current row - cursorChanged = false; - for (int i = 0; i < sourcesCount; i++) { - if (currentPartition == null) { - topKey = null; - return; - } - if (seekOneSource(i)) { - cursorChanged = true; - break; - } - } - } - topKey = buildKey(currentPartition, nullText, currentDocID); - } - - public static String stringTopKey(SortedKeyValueIterator<Key, Value> iter) { - if (iter.hasTop()) - return iter.getTopKey().toString(); - return ""; - } - - private static final String columnFamiliesOptionName = "columnFamilies"; - private static final String notFlagOptionName = "notFlag"; - - /** - * @param columns - * @return encoded columns - * @deprecated since 1.4. To be made protected. Do not interact with flags string directly, just use - * {@link #setColumnFamilies(IteratorSetting, Text[], boolean[])}. - */ - public static String encodeColumns(Text[] columns) { - StringBuilder sb = new StringBuilder(); - for (int i = 0; i < columns.length; i++) { - sb.append(new String(Base64.encodeBase64(TextUtil.getBytes(columns[i])))); - sb.append('\n'); - } - return sb.toString(); - } - - /** - * @param flags - * @return encoded flags - * @deprecated since 1.4. To be made protected. Do not interact with flags string directly, just use - * {@link #setColumnFamilies(IteratorSetting, Text[], boolean[])}. - */ - public static String encodeBooleans(boolean[] flags) { - byte[] bytes = new byte[flags.length]; - for (int i = 0; i < flags.length; i++) { - if (flags[i]) - bytes[i] = 1; - else - bytes[i] = 0; - } - return new String(Base64.encodeBase64(bytes)); - } - - protected static Text[] decodeColumns(String columns) { - String[] columnStrings = columns.split("\n"); - Text[] columnTexts = new Text[columnStrings.length]; - for (int i = 0; i < columnStrings.length; i++) { - columnTexts[i] = new Text(Base64.decodeBase64(columnStrings[i].getBytes())); - } - return columnTexts; - } - - /** - * to be made protected - * - * @param flags - * @return decoded flags - * @deprecated since 1.4. To be made protected. Do not interact with flags string directly, just use - * {@link #setColumnFamilies(IteratorSetting, Text[], boolean[])}. 
- */ - public static boolean[] decodeBooleans(String flags) { - // return null if there were no flags - if (flags == null) - return null; - - byte[] bytes = Base64.decodeBase64(flags.getBytes()); - boolean[] bFlags = new boolean[bytes.length]; - for (int i = 0; i < bytes.length; i++) { - if (bytes[i] == 1) - bFlags[i] = true; - else - bFlags[i] = false; - } - return bFlags; - } - - @Override - public void init(SortedKeyValueIterator<Key, Value> source, Map<String, String> options, IteratorEnvironment env) throws IOException { - Text[] terms = decodeColumns(options.get(columnFamiliesOptionName)); - boolean[] notFlag = decodeBooleans(options.get(notFlagOptionName)); - - if (terms.length < 2) { - throw new IllegalArgumentException("IntersectionIterator requires two or more column families"); - } - - // Scan the not flags. - // There must be at least one term that isn't negated - // And we are going to re-order such that the first term is not a ! term - if (notFlag == null) { - notFlag = new boolean[terms.length]; - for (int i = 0; i < terms.length; i++) - notFlag[i] = false; - } - if (notFlag[0]) { - for (int i = 1; i < notFlag.length; i++) { - if (notFlag[i] == false) { - Text swapFamily = new Text(terms[0]); - terms[0].set(terms[i]); - terms[i].set(swapFamily); - notFlag[0] = false; - notFlag[i] = true; - break; - } - } - if (notFlag[0]) { - throw new IllegalArgumentException("IntersectionIterator requires at least one column family that is not negated"); - } - } - - sources = new TermSource[terms.length]; - sources[0] = new TermSource(source, terms[0]); - for (int i = 1; i < terms.length; i++) { - sources[i] = new TermSource(source.deepCopy(env), terms[i], notFlag[i]); - } - sourcesCount = terms.length; - } - - @Override - public void seek(Range range, Collection<ByteSequence> seekColumnFamilies, boolean inclusive) throws IOException { - overallRange = new Range(range); - currentPartition = new Text(); - currentDocID.set(emptyByteArray); - - // seek each of the sources to the right column family within the row given by key - for (int i = 0; i < sourcesCount; i++) { - Key sourceKey; - if (range.getStartKey() != null) { - if (range.getStartKey().getColumnQualifier() != null) { - sourceKey = buildKey(getPartition(range.getStartKey()), sources[i].term, range.getStartKey().getColumnQualifier()); - } else { - sourceKey = buildKey(getPartition(range.getStartKey()), sources[i].term); - } - // Seek only to the term for this source as a column family - sources[i].iter.seek(new Range(sourceKey, true, null, false), sources[i].seekColfams, true); - } else { - // Seek only to the term for this source as a column family - sources[i].iter.seek(range, sources[i].seekColfams, true); - } - } - advanceToIntersection(); - } - - public void addSource(SortedKeyValueIterator<Key, Value> source, IteratorEnvironment env, Text term, boolean notFlag) { - // Check if we have space for the added Source - if (sources == null) { - sources = new TermSource[1]; - } else { - // allocate space for node, and copy current tree. - // TODO: Should we change this to an ArrayList so that we can just add() ? - TermSource[] localSources = new TermSource[sources.length + 1]; - int currSource = 0; - for (TermSource myTerm : sources) { - // TODO: Do I need to call new here? or can I just re-use the term? 
-                localSources[currSource] = new TermSource(myTerm);
-                currSource++;
-            }
-            sources = localSources;
-        }
-        sources[sourcesCount] = new TermSource(source.deepCopy(env), term, notFlag);
-        sourcesCount++;
-    }
-
-    /**
-     * Encode the columns to be used when iterating.
-     *
-     * @param cfg
-     * @param columns
-     */
-    public static void setColumnFamilies(IteratorSetting cfg, Text[] columns) {
-        if (columns.length < 2)
-            throw new IllegalArgumentException("Must supply at least two terms to intersect");
-        cfg.addOption(AndingIterator.columnFamiliesOptionName, AndingIterator.encodeColumns(columns));
-    }
-
-    /**
-     * Encode columns and NOT flags indicating which columns should be negated (docIDs will be excluded if matching negated columns, instead
-     * of included).
-     *
-     * @param cfg
-     * @param columns
-     * @param notFlags
-     */
-    public static void setColumnFamilies(IteratorSetting cfg, Text[] columns, boolean[] notFlags) {
-        if (columns.length < 2)
-            throw new IllegalArgumentException("Must supply at least two terms to intersect");
-        if (columns.length != notFlags.length)
-            throw new IllegalArgumentException("columns and notFlags arrays must be the same length");
-        setColumnFamilies(cfg, columns);
-        cfg.addOption(AndingIterator.notFlagOptionName, AndingIterator.encodeBooleans(notFlags));
-    }
-}

http://git-wip-us.apache.org/repos/asf/incubator-rya/blob/5a03ef61/extras/indexing/src/main/java/mvm/rya/indexing/accumulo/freetext/iterators/BooleanTreeIterator.java
----------------------------------------------------------------------
diff --git a/extras/indexing/src/main/java/mvm/rya/indexing/accumulo/freetext/iterators/BooleanTreeIterator.java b/extras/indexing/src/main/java/mvm/rya/indexing/accumulo/freetext/iterators/BooleanTreeIterator.java
deleted file mode 100644
index a69b78a..0000000
--- a/extras/indexing/src/main/java/mvm/rya/indexing/accumulo/freetext/iterators/BooleanTreeIterator.java
+++ /dev/null
@@ -1,322 +0,0 @@
-package mvm.rya.indexing.accumulo.freetext.iterators;
-
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-
-
-import static mvm.rya.indexing.accumulo.freetext.query.ASTNodeUtils.allChildrenAreNot;
-import static mvm.rya.indexing.accumulo.freetext.query.ASTNodeUtils.findFirstNonNotChild;
-import static mvm.rya.indexing.accumulo.freetext.query.ASTNodeUtils.getNodeIterator;
-import static mvm.rya.indexing.accumulo.freetext.query.ASTNodeUtils.isNotFlag;
-import static mvm.rya.indexing.accumulo.freetext.query.ASTNodeUtils.pushChild;
-import static mvm.rya.indexing.accumulo.freetext.query.ASTNodeUtils.swapChildren;
-
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.List;
-import java.util.Map;
-import java.util.NoSuchElementException;
-
-import mvm.rya.indexing.accumulo.freetext.ColumnPrefixes;
-import mvm.rya.indexing.accumulo.freetext.query.ASTExpression;
-import mvm.rya.indexing.accumulo.freetext.query.ASTTerm;
-import mvm.rya.indexing.accumulo.freetext.query.ParseException;
-import mvm.rya.indexing.accumulo.freetext.query.QueryParser;
-import mvm.rya.indexing.accumulo.freetext.query.QueryParserTreeConstants;
-import mvm.rya.indexing.accumulo.freetext.query.SimpleNode;
-import mvm.rya.indexing.accumulo.freetext.query.TokenMgrError;
-
-import org.apache.accumulo.core.client.IteratorSetting;
-import org.apache.accumulo.core.data.ByteSequence;
-import org.apache.accumulo.core.data.Key;
-import org.apache.accumulo.core.data.Range;
-import org.apache.accumulo.core.data.Value;
-import org.apache.accumulo.core.iterators.IteratorEnvironment;
-import org.apache.accumulo.core.iterators.OptionDescriber;
-import org.apache.accumulo.core.iterators.SortedKeyValueIterator;
-import org.apache.accumulo.core.iterators.system.MultiIterator;
-import org.apache.commons.lang.Validate;
-import org.apache.hadoop.io.Text;
-import org.apache.log4j.Logger;
-
-public class BooleanTreeIterator implements SortedKeyValueIterator<Key, Value>, OptionDescriber {
-    private static final Logger logger = Logger.getLogger(BooleanTreeIterator.class);
-
-    private static final String queryOptionName = "query";
-
-    private SortedKeyValueIterator<Key, Value> iter;
-    private SortedKeyValueIterator<Key, Value> docSource;
-
-    @Override
-    public void init(SortedKeyValueIterator<Key, Value> source, Map<String, String> options, IteratorEnvironment env) throws IOException {
-
-        // pull out the query
-        String query = options.get(queryOptionName);
-
-        // create the parse tree
-        SimpleNode root;
-        try {
-            root = QueryParser.parse(query);
-        } catch (ParseException e) {
-            // log and wrap in IOException
-            logger.error("ParseException encountered while parsing: " + query, e);
-            throw new IOException(e);
-        } catch (TokenMgrError e) {
-            // log and wrap in IOException
-            logger.error("TokenMgrError encountered while parsing: " + query, e);
-            throw new IOException(e);
-        }
-
-        docSource = source.deepCopy(env);
-        iter = createIterator((SimpleNode) root.jjtGetChild(0), source, env);
-    }
-
-    private SortedKeyValueIterator<Key, Value> createIterator(SimpleNode root, SortedKeyValueIterator<Key, Value> source,
-            IteratorEnvironment env) {
-        // if the root is only a single term, wrap it in an expression node
-        if (root instanceof ASTTerm) {
-            ASTExpression expression = new ASTExpression(QueryParserTreeConstants.JJTEXPRESSION);
-            expression.setNotFlag(false);
-            expression.setType(ASTExpression.AND);
-
-            pushChild(expression, root);
-            root.jjtSetParent(expression);
-
-            root = expression;
-        }
-
-        // Pre-process the tree to compensate for iterator-specific issues with certain topologies
-        preProcessTree(root);
-
-        // Build an iterator tree
-        return createIteratorRecursive(root, source, env);
-    }
-
-    private SortedKeyValueIterator<Key, Value> createIteratorRecursive(SimpleNode node, SortedKeyValueIterator<Key, Value> source,
-            IteratorEnvironment env) {
-
-        Validate.isTrue(node instanceof ASTExpression, "node must be of type ASTExpression. Node is instance of "
-                + node.getClass().getName());
-
-        ASTExpression expression = (ASTExpression) node;
-
-        if (expression.getType().equals(ASTExpression.AND)) {
-            return getAndIterator(node, source, env);
-        }
-
-        if (expression.getType().equals(ASTExpression.OR)) {
-            return getOrIterator(node, source, env);
-        }
-
-        throw new IllegalArgumentException("Expression is of unknown type: " + expression.getType());
-
-    }
-
-    private MultiIterator getOrIterator(SimpleNode node, SortedKeyValueIterator<Key, Value> source, IteratorEnvironment env) {
-        List<SortedKeyValueIterator<Key, Value>> iters = new ArrayList<SortedKeyValueIterator<Key, Value>>();
-
-        for (SimpleNode n : getNodeIterator(node)) {
-            if (n instanceof ASTExpression) {
-                iters.add(createIteratorRecursive(n, source, env));
-            } else if (n instanceof ASTTerm) {
-                iters.add(getSimpleAndingIterator((ASTTerm) n, source, env));
-            } else {
-                throw new IllegalArgumentException("Node is of unknown type: " + n.getClass().getName());
-            }
-        }
-
-        return new MultiIterator(iters, new Range());
-    }
-
-    private AndingIterator getAndIterator(SimpleNode node, SortedKeyValueIterator<Key, Value> source, IteratorEnvironment env) {
-
-        AndingIterator anding = new AndingIterator();
-
-        for (SimpleNode n : getNodeIterator(node)) {
-            boolean isNotFlag = isNotFlag(n);
-            if (n instanceof ASTExpression) {
-                anding.addSource(createIteratorRecursive(n, source, env), env, null, isNotFlag);
-            } else if (n instanceof ASTTerm) {
-                ASTTerm term = ((ASTTerm) n);
-                anding.addSource(source, env, getTermColFam(term), isNotFlag);
-            } else {
-                throw new IllegalArgumentException("Node is of unknown type: " + n.getClass().getName());
-            }
-        }
-
-        return anding;
-    }
-
-    private static Text getTermColFam(ASTTerm termnode) {
-        String term = termnode.getTerm();
-        if (term == null) {
-            // if the term is null, then we want all of the documents
-            return ColumnPrefixes.DOCS_CF_PREFIX;
-        }
-        if (term.contains("\0")) {
-            // if the term contains a null char, then it's already formatted as a CF
-            return new Text(term);
-        }
-
-        // otherwise, point to the term CF
-        return ColumnPrefixes.getTermColFam(term.toLowerCase());
-    }
-
-    private AndingIterator getSimpleAndingIterator(ASTTerm node, SortedKeyValueIterator<Key, Value> source, IteratorEnvironment env) {
-        Validate.isTrue(!node.isNotFlag(), "Simple Anding node must not have \"not\" flag set");
-
-        AndingIterator anding = new AndingIterator();
-        anding.addSource(source, env, getTermColFam(node), false);
-        return anding;
-    }
-
-    /**
-     * Handle "lonely nots" (i.e. expressions with only nots), "or" statements containing nots, and make sure that the first term in an
-     * "and" statement is not a not. This is due to implementation-specific limitations of the iterators.
-     * <p>
-     * For example:
-     * <ul>
-     * <li>lonely nots: (!a & !b) -> [all] & !a & !b</li>
-     * <li>"or" nots: (!a | b) -> ( ([all] & !a) | b)</li>
-     * <li>reorder "and" nots: (!a & b) -> ( b & !a )</li>
-     * </ul>
-     **/
-    public static void preProcessTree(SimpleNode s) {
-        for (SimpleNode child : getNodeIterator(s)) {
-            preProcessTree(child);
-        }
-
-        if (s instanceof ASTExpression) {
-            ASTExpression expression = (ASTExpression) s;
-
-            if (expression.getType().equals(ASTExpression.AND)) {
-                if (allChildrenAreNot(expression)) {
-                    // lonely nots: (!a & !b) -> [all] & !a & !b
-                    ASTTerm allDocsTerm = createAllDocTermNode();
-                    pushChild(expression, allDocsTerm);
-                } else if (isNotFlag(expression.jjtGetChild(0))) {
-                    // reorder "and" nots: (!a & b) -> ( b & !a )
-                    int firstNonNotChild = findFirstNonNotChild(expression);
-                    swapChildren(expression, 0, firstNonNotChild);
-                }
-            }
-
-            if (expression.getType().equals(ASTExpression.OR)) {
-                for (int i = 0; i < expression.jjtGetNumChildren(); i++) {
-                    SimpleNode child = (SimpleNode) expression.jjtGetChild(i);
-                    if (isNotFlag(child)) {
-                        // "or" nots: (!a | b) -> ( ([all] & !a) | b)
-                        // create the new expression
-                        ASTExpression newExpression = new ASTExpression(QueryParserTreeConstants.JJTEXPRESSION);
-                        newExpression.setNotFlag(false);
-                        newExpression.setType(ASTExpression.AND);
-                        pushChild(newExpression, child);
-                        pushChild(newExpression, createAllDocTermNode());
-
-                        // tie the new expression to the old one
-                        newExpression.jjtSetParent(expression);
-                        expression.jjtAddChild(newExpression, i);
-                    }
-                }
-            }
-        }
-
-    }
-
-    public static ASTTerm createAllDocTermNode() {
-        ASTTerm t = new ASTTerm(QueryParserTreeConstants.JJTTERM);
-        t.setNotFlag(false);
-        t.setType(ASTTerm.TERM);
-        // note: a "null" signifies "all docs" should be returned.
-        t.setTerm(null);
-        return t;
-    }
-
-    @Override
-    public boolean hasTop() {
-        return iter.hasTop();
-    }
-
-    @Override
-    public void next() throws IOException {
-        iter.next();
-        if (iter.hasTop()) {
-            seekDocSource(iter.getTopKey());
-        }
-    }
-
-    @Override
-    public void seek(Range range, Collection<ByteSequence> columnFamilies, boolean inclusive) throws IOException {
-        iter.seek(range, columnFamilies, inclusive);
-        if (iter.hasTop()) {
-            seekDocSource(iter.getTopKey());
-        }
-    }
-
-    private void seekDocSource(Key key) throws IOException {
-        Key docKey = new Key(key.getRow(), ColumnPrefixes.DOCS_CF_PREFIX, key.getColumnQualifier());
-        docSource.seek(new Range(docKey, true, null, false), Collections.<ByteSequence> emptyList(), false);
-    }
-
-    @Override
-    public Key getTopKey() {
-        // from intersecting iterator:
-        // RowID: shardID
-        // CF: (empty)
-        // CQ: docID
-        return iter.getTopKey();
-    }
-
-    @Override
-    public Value getTopValue() {
-        if (!iter.hasTop()) {
-            throw new NoSuchElementException();
-        }
-
-        return docSource.getTopValue();
-    }
-
-    @Override
-    public SortedKeyValueIterator<Key, Value> deepCopy(IteratorEnvironment env) {
-        throw new UnsupportedOperationException();
-    }
-
-    public static void setQuery(IteratorSetting cfg, String query) {
-        cfg.addOption(BooleanTreeIterator.queryOptionName, query);
-    }
-
-    @Override
-    public IteratorOptions describeOptions() {
-        return new IteratorOptions("FreeTextBooleanTree", "Perform a FreeText query on a properly formatted table",
-                Collections.singletonMap(queryOptionName, "the free text query"),
-                null);
-    }
-
-    @Override
-    public boolean validateOptions(Map<String, String> options) {
-        String q = options.get(queryOptionName);
-        if (q == null || q.isEmpty())
-            throw new IllegalArgumentException(queryOptionName + " must not be empty");
-        return true;
-    }
-
-}

http://git-wip-us.apache.org/repos/asf/incubator-rya/blob/5a03ef61/extras/indexing/src/main/java/mvm/rya/indexing/accumulo/freetext/query/ASTExpression.java
----------------------------------------------------------------------
diff --git a/extras/indexing/src/main/java/mvm/rya/indexing/accumulo/freetext/query/ASTExpression.java b/extras/indexing/src/main/java/mvm/rya/indexing/accumulo/freetext/query/ASTExpression.java
deleted file mode 100644
index 95783e5..0000000
--- a/extras/indexing/src/main/java/mvm/rya/indexing/accumulo/freetext/query/ASTExpression.java
+++ /dev/null
@@ -1,63 +0,0 @@
-package mvm.rya.indexing.accumulo.freetext.query;
-
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-
-
-/**
- * This is a slightly modified version of the ASTExpression file created by JavaCC. This version adds more state to the standard generated
- * file, including a "type" and a "notFlag".
- */
-public class ASTExpression extends SimpleNode {
-    public static final String AND = "AND";
-    public static final String OR = "OR";
-
-    private String type = "";
-    private boolean notFlag = false;
-
-    public ASTExpression(int id) {
-        super(id);
-    }
-
-    public ASTExpression(QueryParser p, int id) {
-        super(p, id);
-    }
-
-    public void setType(String type) {
-        this.type = type;
-    }
-
-    public String getType() {
-        return type;
-    }
-
-    public boolean isNotFlag() {
-        return notFlag;
-    }
-
-    public void setNotFlag(boolean notFlag) {
-        this.notFlag = notFlag;
-    }
-
-    @Override
-    public String toString() {
-        return super.toString() + " [type: " + type + ", notFlag: " + notFlag + "]";
-    }
-}

http://git-wip-us.apache.org/repos/asf/incubator-rya/blob/5a03ef61/extras/indexing/src/main/java/mvm/rya/indexing/accumulo/freetext/query/ASTNodeUtils.java
----------------------------------------------------------------------
diff --git a/extras/indexing/src/main/java/mvm/rya/indexing/accumulo/freetext/query/ASTNodeUtils.java b/extras/indexing/src/main/java/mvm/rya/indexing/accumulo/freetext/query/ASTNodeUtils.java
deleted file mode 100644
index 27edaac..0000000
--- a/extras/indexing/src/main/java/mvm/rya/indexing/accumulo/freetext/query/ASTNodeUtils.java
+++ /dev/null
@@ -1,210 +0,0 @@
-package mvm.rya.indexing.accumulo.freetext.query;
-
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-
-
-import java.util.ArrayList;
-import java.util.Iterator;
-import java.util.List;
-import java.util.NoSuchElementException;
-
-import org.apache.commons.lang.StringUtils;
-import org.apache.commons.lang.Validate;
-
-public class ASTNodeUtils {
-
-    /**
-     * Serialize a node (and its children) to a parsable string.
-     *
-     * @param s
-     * @return
-     */
-    public static String serializeExpression(Node s) {
-        if (s instanceof ASTTerm) {
-            ASTTerm a = (ASTTerm) s;
-            return (a.isNotFlag() ? "!" : "") + " " + a.getTerm();
-        }
-
-        String prefix = "";
-        String suffix = "";
-        String join = " ";
-        if (s instanceof ASTExpression) {
-            ASTExpression a = (ASTExpression) s;
-            prefix = (a.isNotFlag() ? "!" : "") + "(";
-            suffix = ")";
-            join = " " + a.getType() + " ";
-        }
-
-        List<String> children = new ArrayList<String>();
-        for (int i = 0; i < s.jjtGetNumChildren(); i++) {
-            children.add(serializeExpression(s.jjtGetChild(i)));
-        }
-        return prefix + StringUtils.join(children, join) + suffix;
-
-    }
-
-    /**
-     * Count the number of terms in this query tree.
-     *
-     * @param node
-     * @return
-     */
-    public static int termCount(Node node) {
-        // note: ASTTerm must be checked before SimpleNode; ASTTerm extends SimpleNode, so the
-        // SimpleNode branch would otherwise shadow it and every term would count as zero
-        if (node instanceof ASTTerm) {
-            return 1;
-        } else if (node instanceof SimpleNode) {
-            int count = 0;
-            for (SimpleNode n : getNodeIterator((SimpleNode) node)) {
-                count += termCount(n);
-            }
-            return count;
-        } else {
-            throw new IllegalArgumentException("Node is of unknown type: " + node.getClass().getName());
-        }
-    }
-
-    /**
-     * Add the child as the parent's first child.
-     *
-     * @param parent
-     * @param child
-     */
-    public static void pushChild(SimpleNode parent, SimpleNode child) {
-        // note: this implementation is tightly coupled to the SimpleNode jjt implementation
-        int parentSize = parent.jjtGetNumChildren();
-
-        // expand the parent node
-        parent.jjtAddChild(null, parentSize);
-
-        // get the current head child
-        Node currentHeadChild = parent.jjtGetChild(0);
-
-        // set the parameter as the parent's first child
-        parent.jjtAddChild(child, 0);
-
-        // add the former head child to the end of the list
-        if (currentHeadChild != null) {
-            parent.jjtAddChild(currentHeadChild, parentSize);
-        }
-
-        // tie the child to the parent
-        child.jjtSetParent(parent);
-
-    }
-
-    /**
-     * Get the index of the child, or -1 if the child is not found.
-     *
-     * @param parent
-     * @param child
-     */
-    public static int getChildIndex(SimpleNode parent, SimpleNode child) {
-        int parentSize = parent.jjtGetNumChildren();
-
-        for (int i = 0; i < parentSize; i++) {
-            if (child.equals(parent.jjtGetChild(i))) {
-                return i;
-            }
-        }
-
-        return -1;
-    }
-
-    /**
-     * Return true if all of the node's children have the not flag enabled.
-     *
-     * @param node
-     * @return
-     */
-    public static boolean allChildrenAreNot(ASTExpression node) {
-        for (SimpleNode child : getNodeIterator(node)) {
-            if (!isNotFlag(child)) {
-                return false;
-            }
-        }
-        return true;
-    }
-
-    /**
-     * Return the node's not-flag value. The node must be of type {@link ASTTerm} or {@link ASTExpression}.
-     *
-     * @param node
-     * @return
-     */
-    public static boolean isNotFlag(Node node) {
-        if (node instanceof ASTExpression) {
-            return ((ASTExpression) node).isNotFlag();
-        } else if (node instanceof ASTTerm) {
-            return ((ASTTerm) node).isNotFlag();
-        } else {
-            throw new IllegalArgumentException("Node is of unknown type: " + node.getClass().getName());
-        }
-    }
-
-    public static Iterable<SimpleNode> getNodeIterator(final SimpleNode n) {
-        return new Iterable<SimpleNode>() {
-
-            @Override
-            public Iterator<SimpleNode> iterator() {
-                return new Iterator<SimpleNode>() {
-                    int pointer = 0;
-
-                    @Override
-                    public boolean hasNext() {
-                        return pointer < n.jjtGetNumChildren();
-                    }
-
-                    @Override
-                    public SimpleNode next() {
-                        Node rtn = n.jjtGetChild(pointer);
-                        pointer++;
-                        return (SimpleNode) rtn;
-                    }
-
-                    @Override
-                    public void remove() {
-                        throw new UnsupportedOperationException();
-                    }
-                };
-            }
-        };
-    }
-
-    public static void swapChildren(ASTExpression parent, int childOneIndex, int childTwoIndex) {
-        Validate.isTrue(childOneIndex > -1 && childOneIndex < parent.jjtGetNumChildren());
-        Validate.isTrue(childTwoIndex > -1 && childTwoIndex < parent.jjtGetNumChildren());
-
-        Node childOne = parent.jjtGetChild(childOneIndex);
-        Node childTwo = parent.jjtGetChild(childTwoIndex);
-        parent.jjtAddChild(childOne, childTwoIndex);
-        parent.jjtAddChild(childTwo, childOneIndex);
-    }
-
-    public static int findFirstNonNotChild(ASTExpression expression) {
-        for (int i = 0; i < expression.jjtGetNumChildren(); i++) {
-            if (!isNotFlag(expression.jjtGetChild(i))) {
-                return i;
-            }
-        }
-        return -1;
-    }
-
-}
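
For readers tracing how the deleted classes fit together, here is a minimal client-side sketch of running a free-text query through the BooleanTreeIterator. Only BooleanTreeIterator.setQuery(...) comes from the deleted source above; the Connector, table name, iterator priority/name, and query string are illustrative assumptions, not part of the removed API.

import java.util.Map.Entry;

import mvm.rya.indexing.accumulo.freetext.iterators.BooleanTreeIterator;

import org.apache.accumulo.core.client.Connector;
import org.apache.accumulo.core.client.IteratorSetting;
import org.apache.accumulo.core.client.Scanner;
import org.apache.accumulo.core.data.Key;
import org.apache.accumulo.core.data.Value;
import org.apache.accumulo.core.security.Authorizations;

public class FreeTextScanSketch {
    public static void scan(Connector connector, String documentTable) throws Exception {
        Scanner scanner = connector.createScanner(documentTable, new Authorizations());
        // priority 20 and the name "booleanTree" are arbitrary choices for this sketch
        IteratorSetting cfg = new IteratorSetting(20, "booleanTree", BooleanTreeIterator.class);
        // query syntax per the preProcessTree Javadoc examples above, e.g. "!a & b"
        BooleanTreeIterator.setQuery(cfg, "paul & smith");
        scanner.addScanIterator(cfg);
        for (Entry<Key, Value> entry : scanner) {
            // per getTopKey() above: row = shardID, CQ = docID; the value is the matching document
            System.out.println(entry.getKey() + " -> " + entry.getValue());
        }
    }
}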

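The AndingIterator's option encoding is simple enough to sanity-check in isolation: each column family becomes one Base64 token, newline-joined, and decoding splits and reverses that. The standalone re-implementation below (class and method names are mine, using plain commons-codec Base64 rather than the iterator's helpers) demonstrates that the encoding round-trips.

import java.util.Arrays;

import org.apache.commons.codec.binary.Base64;
import org.apache.hadoop.io.Text;

public class ColumnOptionCodecSketch {
    // one Base64 token per column, newline-terminated, mirroring AndingIterator.encodeColumns
    static String encodeColumns(Text[] columns) {
        StringBuilder sb = new StringBuilder();
        for (Text column : columns) {
            // Text's backing array may be longer than the value, so copy only the valid bytes
            byte[] bytes = Arrays.copyOf(column.getBytes(), column.getLength());
            sb.append(new String(Base64.encodeBase64(bytes))).append('\n');
        }
        return sb.toString();
    }

    // split on newlines and Base64-decode each token, mirroring AndingIterator.decodeColumns
    static Text[] decodeColumns(String encoded) {
        String[] tokens = encoded.split("\n");
        Text[] columns = new Text[tokens.length];
        for (int i = 0; i < tokens.length; i++) {
            columns[i] = new Text(Base64.decodeBase64(tokens[i].getBytes()));
        }
        return columns;
    }

    public static void main(String[] args) {
        Text[] terms = { new Text("food"), new Text("foot") };
        Text[] roundTripped = decodeColumns(encodeColumns(terms));
        System.out.println(Arrays.equals(terms, roundTripped)); // prints: true
    }
}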