[jira] Updated: (JCR-2818) Upgrade spellChecker for 2.x

Zhou Wu (JIRA) Tue, 23 Nov 2010 10:45:45 -0800

     [ 
https://issues.apache.org/jira/browse/JCR-2818?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]


Zhou Wu updated JCR-2818:
-------------------------

    Status: Patch Available  (was: Open)

A patch for fixing the problem.


Index: LuceneSpellChecker.java
===================================================================
--- LuceneSpellChecker.java     (revision 1037959)
+++ LuceneSpellChecker.java     (working copy)
@@ -19,9 +19,9 @@
 import org.apache.jackrabbit.core.query.lucene.SearchIndex;
 import org.apache.jackrabbit.core.query.lucene.FieldNames;
 import org.apache.jackrabbit.core.query.QueryHandler;
-import org.apache.jackrabbit.core.query.QueryRootNode;
-import org.apache.jackrabbit.core.query.TraversingQueryNodeVisitor;
-import org.apache.jackrabbit.core.query.RelationQueryNode;
+import org.apache.jackrabbit.spi.commons.query.QueryRootNode;
+import org.apache.jackrabbit.spi.commons.query.TraversingQueryNodeVisitor;
+import org.apache.jackrabbit.spi.commons.query.RelationQueryNode;
 import org.apache.lucene.search.spell.SpellChecker;
 import org.apache.lucene.search.spell.Dictionary;
 import org.apache.lucene.search.spell.LuceneDictionary;
@@ -40,6 +40,7 @@
 import java.io.File;
 import java.util.ArrayList;
 import java.util.List;
+import javax.jcr.RepositoryException;
 
 /**
  * <code>LuceneSpellChecker</code> implements a spell checker based on the 
terms
@@ -48,351 +49,360 @@
 public class LuceneSpellChecker
         implements org.apache.jackrabbit.core.query.lucene.SpellChecker {
 
-    /**
-     * Logger instance for this class.
-     */
-    private static final Logger log = 
LoggerFactory.getLogger(LuceneSpellChecker.class);
+  /**
+   * Logger instance for this class.
+   */
+  private static final Logger log = 
LoggerFactory.getLogger(LuceneSpellChecker.class);
 
-    public static final class FiveSecondsRefreshInterval extends 
LuceneSpellChecker {
-        public FiveSecondsRefreshInterval() {
-            super(5 * 1000);
-        }
+  public static final class FiveSecondsRefreshInterval extends 
LuceneSpellChecker {
+
+    public FiveSecondsRefreshInterval() {
+      super(5 * 1000);
     }
+  }
 
-    public static final class OneMinuteRefreshInterval extends 
LuceneSpellChecker {
-        public OneMinuteRefreshInterval() {
-            super(60 * 1000);
-        }
+  public static final class OneMinuteRefreshInterval extends 
LuceneSpellChecker {
+
+    public OneMinuteRefreshInterval() {
+      super(60 * 1000);
     }
+  }
 
-    public static final class FiveMinutesRefreshInterval extends 
LuceneSpellChecker {
-        public FiveMinutesRefreshInterval() {
-            super(5 * 60 * 1000);
-        }
+  public static final class FiveMinutesRefreshInterval extends 
LuceneSpellChecker {
+
+    public FiveMinutesRefreshInterval() {
+      super(5 * 60 * 1000);
     }
+  }
 
-    public static final class ThirtyMinutesRefreshInterval extends 
LuceneSpellChecker {
-        public ThirtyMinutesRefreshInterval() {
-            super(30 * 60 * 1000);
-        }
+  public static final class ThirtyMinutesRefreshInterval extends 
LuceneSpellChecker {
+
+    public ThirtyMinutesRefreshInterval() {
+      super(30 * 60 * 1000);
     }
+  }
 
-    public static final class OneHourRefreshInterval extends 
LuceneSpellChecker {
-        public OneHourRefreshInterval() {
-            super(60 * 60 * 1000);
-        }
+  public static final class OneHourRefreshInterval extends LuceneSpellChecker {
+
+    public OneHourRefreshInterval() {
+      super(60 * 60 * 1000);
     }
+  }
 
-    public static final class SixHoursRefreshInterval extends 
LuceneSpellChecker {
-        public SixHoursRefreshInterval() {
-            super(6 * 60 * 60 * 1000);
-        }
+  public static final class SixHoursRefreshInterval extends LuceneSpellChecker 
{
+
+    public SixHoursRefreshInterval() {
+      super(6 * 60 * 60 * 1000);
     }
+  }
 
-    public static final class TwelveHoursRefreshInterval extends 
LuceneSpellChecker {
-        public TwelveHoursRefreshInterval() {
-            super(12 * 60 * 60 * 1000);
-        }
+  public static final class TwelveHoursRefreshInterval extends 
LuceneSpellChecker {
+
+    public TwelveHoursRefreshInterval() {
+      super(12 * 60 * 60 * 1000);
     }
+  }
 
-    public static final class OneDayRefreshInterval extends LuceneSpellChecker 
{
-        public OneDayRefreshInterval() {
-            super(24 * 60 * 60 * 1000);
-        }
+  public static final class OneDayRefreshInterval extends LuceneSpellChecker {
+
+    public OneDayRefreshInterval() {
+      super(24 * 60 * 60 * 1000);
     }
+  }
+  /**
+   * The internal spell checker.
+   */
+  private InternalSpellChecker spellChecker;
+  /**
+   * The refresh interval.
+   */
+  private final long refreshInterval;
 
+  /**
+   * Spell checker with a default refresh interval of one hour.
+   */
+  public LuceneSpellChecker() {
+    this(60 * 60 * 1000); // default refresh interval: one hour
+  }
+
+  protected LuceneSpellChecker(long refreshInterval) {
+    this.refreshInterval = refreshInterval;
+  }
+
+  /**
+   * Initializes this spell checker.
+   *
+   * @param handler the query handler that created this spell checker.
+   * @throws IOException if <code>handler</code> is not of type {@link
+   *                     SearchIndex}.
+   */
+  public void init(QueryHandler handler)
+          throws IOException {
+    if (handler instanceof SearchIndex) {
+      this.spellChecker = new InternalSpellChecker((SearchIndex) handler);
+    } else {
+      throw new IOException("LuceneSpellChecker only works with "
+              + SearchIndex.class.getName());
+    }
+  }
+
+  /**
+   * {@inheritDoc}
+   */
+  public String check(QueryRootNode aqt) throws IOException {
+    String stmt = null;
+    try {
+      stmt = getFulltextStatement(aqt);
+    } catch (Exception e) {
+    }
+    if (stmt == null) {
+      // no spellcheck operation in query
+      return null;
+    }
+
+    return spellChecker.suggest(stmt);
+  }
+
+  public void close() {
+    spellChecker.close();
+  }
+
+  //------------------------------< internal >--------------------------------
+  /**
+   * Returns the fulltext statement of a spellcheck relation query node or
+   * <code>null</code> if none exists in the abstract query tree.
+   *
+   * @param aqt the abstract query tree.
+   * @return the fulltext statement or <code>null</code>.
+   */
+  private String getFulltextStatement(QueryRootNode aqt) throws 
RepositoryException {
+    final String[] stmt = new String[1];
+    aqt.accept(new TraversingQueryNodeVisitor() {
+
+      public Object visit(RelationQueryNode node, Object o)
+              throws RepositoryException {
+        if (stmt[0] == null && node.getOperation() == 
RelationQueryNode.OPERATION_SPELLCHECK) {
+          stmt[0] = node.getStringValue();
+        }
+        return super.visit(node, o);
+      }
+    }, null);
+    return stmt[0];
+  }
+
+  private final class InternalSpellChecker {
+
     /**
-     * The internal spell checker.
+     * Timestamp when the last refresh was done.
      */
-    private InternalSpellChecker spellChecker;
-
+    private long lastRefresh;
     /**
-     * The refresh interval.
+     * Set to true while a refresh is done in a separate thread.
      */
-    private final long refreshInterval;
+    private boolean refreshing = false;
+    /**
+     * The query handler associated with this spell checker.
+     */
+    private final SearchIndex handler;
+    /**
+     * The directory where the spell index is stored.
+     */
+    private final Directory spellIndexDirectory;
+    /**
+     * The underlying spell checker.
+     */
+    private SpellChecker spellChecker;
 
     /**
-     * Spell checker with a default refresh interval of one hour.
+     * Creates a new internal spell checker.
+     * @param handler the associated query handler.
      */
-    public LuceneSpellChecker() {
-        this(60 * 60 * 1000); // default refresh interval: one hour
+    InternalSpellChecker(SearchIndex handler) throws IOException {
+      this.handler = handler;
+      String path = handler.getPath() + File.separatorChar + "spellchecker";
+      this.spellIndexDirectory = FSDirectory.getDirectory(
+              path, new NativeFSLockFactory(path));
+      if (IndexReader.indexExists(spellIndexDirectory)) {
+        this.lastRefresh = System.currentTimeMillis();
+      }
+      this.spellChecker = new SpellChecker(spellIndexDirectory);
+      refreshSpellChecker();
     }
 
-    protected LuceneSpellChecker(long refreshInterval) {
-        this.refreshInterval = refreshInterval;
-    }
-
     /**
-     * Initializes this spell checker.
+     * Checks a fulltext query statement and suggests a spell checked
+     * version of the statement. If the spell checker thinks the spelling is
+     * correct <code>null</code> is returned.
      *
-     * @param handler the query handler that created this spell checker.
-     * @throws IOException if <code>handler</code> is not of type {@link
-     *                     SearchIndex}.
+     * @param statement the fulltext query statement.
+     * @return a suggestion or <code>null</code>.
      */
-    public void init(QueryHandler handler)
-            throws IOException {
-        if (handler instanceof SearchIndex) {
-            this.spellChecker = new InternalSpellChecker((SearchIndex) 
handler);
-        } else {
-            throw new IOException("LuceneSpellChecker only works with " +
-                    SearchIndex.class.getName());
-        }
-    }
+    String suggest(String statement) throws IOException {
+      // tokenize the statement (field name doesn't matter actually...)
+      List words = new ArrayList();
+      List tokens = new ArrayList();
+      tokenize(statement, words, tokens);
 
-    /**
-     * {@inheritDoc}
-     */
-    public String check(QueryRootNode aqt) throws IOException {
-        String stmt = getFulltextStatement(aqt);
-        if (stmt == null) {
-            // no spellcheck operation in query
-            return null;
+      String[] suggestions = check(
+              (String[]) words.toArray(new String[words.size()]));
+      if (suggestions != null) {
+        // replace words in statement in reverse order because length
+        // of statement will change
+        StringBuffer sb = new StringBuffer(statement);
+        for (int i = suggestions.length - 1; i >= 0; i--) {
+          Token t = (Token) tokens.get(i);
+          // only replace if word actually changed
+          if (!t.termText().equalsIgnoreCase(suggestions[i])) {
+            sb.replace(t.startOffset(), t.endOffset(), suggestions[i]);
+          }
         }
-        return spellChecker.suggest(stmt);
+        return sb.toString();
+      } else {
+        return null;
+      }
     }
 
-    public void close() {
-        spellChecker.close();
+    void close() {
+      try {
+        spellIndexDirectory.close();
+      } catch (IOException e) {
+        // ignore
+      }
+      // urgh, the lucene spell checker cannot be closed explicitly.
+      // finalize will close the reader...
+      spellChecker = null;
     }
 
-    //------------------------------< internal 
>--------------------------------
-
     /**
-     * Returns the fulltext statement of a spellcheck relation query node or
-     * <code>null</code> if none exists in the abstract query tree.
+     * Tokenizes the statement into words and tokens.
      *
-     * @param aqt the abstract query tree.
-     * @return the fulltext statement or <code>null</code>.
+     * @param statement the fulltext query statement.
+     * @param words     this list will be filled with the original words
+     *                  extracted from the statement.
+     * @param tokens    this list will be filled with the tokens parsed from
+     *                  the statement.
+     * @throws IOException if an error occurs while parsing the statement.
      */
-    private String getFulltextStatement(QueryRootNode aqt) {
-        final String[] stmt = new String[1];
-        aqt.accept(new TraversingQueryNodeVisitor() {
-            public Object visit(RelationQueryNode node, Object o) {
-                if (stmt[0] == null && node.getOperation() == 
RelationQueryNode.OPERATION_SPELLCHECK) {
-                    stmt[0] = node.getStringValue();
-                }
-                return super.visit(node, o);
+    private void tokenize(String statement, List words, List tokens)
+            throws IOException {
+      TokenStream ts = handler.getTextAnalyzer().tokenStream(
+              FieldNames.FULLTEXT, new StringReader(statement));
+      try {
+        Token t;
+        while ((t = ts.next()) != null) {
+          String origWord = statement.substring(t.startOffset(), 
t.endOffset());
+          if (t.getPositionIncrement() > 0) {
+            words.add(t.termText());
+            tokens.add(t);
+          } else {
+            // very simple implementation: use termText with length
+            // closer to original word
+            Token current = (Token) tokens.get(tokens.size() - 1);
+            if (Math.abs(origWord.length() - current.termText().length())
+                    > Math.abs(origWord.length() - t.termText().length())) {
+              // replace current token and word
+              words.set(words.size() - 1, t.termText());
+              tokens.set(tokens.size() - 1, t);
             }
-        }, null);
-        return stmt[0];
+          }
+        }
+      } finally {
+        ts.close();
+      }
     }
 
-    private final class InternalSpellChecker {
-
-        /**
-         * Timestamp when the last refresh was done.
-         */
-        private long lastRefresh;
-
-        /**
-         * Set to true while a refresh is done in a separate thread.
-         */
-        private boolean refreshing = false;
-
-        /**
-         * The query handler associated with this spell checker.
-         */
-        private final SearchIndex handler;
-
-        /**
-         * The directory where the spell index is stored.
-         */
-        private final Directory spellIndexDirectory;
-
-        /**
-         * The underlying spell checker.
-         */
-        private SpellChecker spellChecker;
-
-        /**
-         * Creates a new internal spell checker.
-         * @param handler the associated query handler.
-         */
-        InternalSpellChecker(SearchIndex handler) throws IOException {
-            this.handler = handler;
-            String path = handler.getPath() + File.separatorChar + 
"spellchecker";
-            this.spellIndexDirectory = FSDirectory.getDirectory(
-                    path, new NativeFSLockFactory(path));
-            if (IndexReader.indexExists(spellIndexDirectory)) {
-                this.lastRefresh = System.currentTimeMillis();
+    /**
+     * Checks the spelling of the passed <code>words</code> and returns a
+     * suggestion.
+     *
+     * @param words the words to check.
+     * @return a suggestion of correctly spelled <code>words</code> or
+     *         <code>null</code> if this spell checker thinks
+     *         <code>words</code> are spelled correctly.
+     * @throws IOException if an error occurs while spell checking.
+     */
+    private String[] check(String words[]) throws IOException {
+      refreshSpellChecker();
+      boolean hasSuggestion = false;
+      IndexReader reader = handler.getIndexReader();
+      try {
+        for (int retries = 0; retries < 100; retries++) {
+          try {
+            String[] suggestion = new String[words.length];
+            for (int i = 0; i < words.length; i++) {
+              String[] similar = spellChecker.suggestSimilar(words[i], 5, 
reader,
+                      FieldNames.FULLTEXT, true);
+              if (similar.length > 0) {
+                suggestion[i] = similar[0];
+                hasSuggestion = true;
+              } else {
+                suggestion[i] = words[i];
+              }
             }
-            this.spellChecker = new SpellChecker(spellIndexDirectory);
-            refreshSpellChecker();
-        }
-
-        /**
-         * Checks a fulltext query statement and suggests a spell checked
-         * version of the statement. If the spell checker thinks the spelling 
is
-         * correct <code>null</code> is returned.
-         *
-         * @param statement the fulltext query statement.
-         * @return a suggestion or <code>null</code>.
-         */
-        String suggest(String statement) throws IOException {
-            // tokenize the statement (field name doesn't matter actually...)
-            List words = new ArrayList();
-            List tokens = new ArrayList();
-            tokenize(statement, words, tokens);
-
-            String[] suggestions = check(
-                    (String[]) words.toArray(new String[words.size()]));
-            if (suggestions != null) {
-                // replace words in statement in reverse order because length
-                // of statement will change
-                StringBuffer sb = new StringBuffer(statement);
-                for (int i = suggestions.length - 1; i >= 0; i--) {
-                    Token t = (Token) tokens.get(i);
-                    // only replace if word acutally changed
-                    if (!t.termText().equalsIgnoreCase(suggestions[i])) {
-                        sb.replace(t.startOffset(), t.endOffset(), 
suggestions[i]);
-                    }
-                }
-                return sb.toString();
+            if (hasSuggestion) {
+              log.debug("Successful after {} retries", new Integer(retries));
+              return suggestion;
             } else {
-                return null;
+              return null;
             }
+          } catch (AlreadyClosedException e) {
+            // it may happen that the index reader inside the
+            // spell checker is closed while searching for
+            // suggestions. this is actually a design flaw in the
+            // lucene spell checker, but for now we simply retry
+          }
         }
+        // unsuccessful after retries
+        return null;
+      } finally {
+        reader.close();
+      }
+    }
 
-        void close() {
-            try {
-                spellIndexDirectory.close();
-            } catch (IOException e) {
-                // ignore
-            }
-            // urgh, the lucene spell checker cannot be closed explicitly.
-            // finalize will close the reader...
-            spellChecker = null;
-        }
+    /**
+     * Refreshes the underlying spell checker in a background thread.
+     * Synchronization is done on this <code>LuceneSpellChecker</code> instance.
+     * While the refresh takes place {@link #refreshing} is set to
+     * <code>true</code>.
+     */
+    private void refreshSpellChecker() {
+      if (lastRefresh + refreshInterval < System.currentTimeMillis()) {
+        synchronized (this) {
+          if (refreshing) {
+            return;
+          } else {
+            refreshing = true;
+            Runnable refresh = new Runnable() {
 
-        /**
-         * Tokenizes the statement into words and tokens.
-         *
-         * @param statement the fulltext query statement.
-         * @param words     this list will be filled with the original words
-         *                  extracted from the statement.
-         * @param tokens    this list will be filled with the tokens parsed 
from
-         *                  the statement.
-         * @throws IOException if an error occurs while parsing the statement.
-         */
-        private void tokenize(String statement, List words, List tokens)
-                throws IOException {
-            TokenStream ts = handler.getTextAnalyzer().tokenStream(
-                    FieldNames.FULLTEXT, new StringReader(statement));
-            try {
-                Token t;
-                while ((t = ts.next()) != null) {
-                    String origWord = statement.substring(t.startOffset(), 
t.endOffset());
-                    if (t.getPositionIncrement() > 0) {
-                        words.add(t.termText());
-                        tokens.add(t);
-                    } else {
-                        // very simple implementation: use termText with length
-                        // closer to original word
-                        Token current = (Token) tokens.get(tokens.size() - 1);
-                        if (Math.abs(origWord.length() - 
current.termText().length()) >
-                                Math.abs(origWord.length() - 
t.termText().length())) {
-                            // replace current token and word
-                            words.set(words.size() - 1, t.termText());
-                            tokens.set(tokens.size() - 1, t);
-                        }
+              public void run() {
+                try {
+                  IndexReader reader = handler.getIndexReader();
+                  try {
+                    long time = System.currentTimeMillis();
+                    Dictionary dict = new LuceneDictionary(
+                            reader, FieldNames.FULLTEXT);
+                    log.debug("Starting spell checker index refresh");
+                    spellChecker.indexDictionary(dict);
+                    time = System.currentTimeMillis() - time;
+                    time = time / 1000;
+                    log.info("Spell checker index refreshed in: {} s.",
+                            new Long(time));
+                  } finally {
+                    reader.close();
+                    synchronized (InternalSpellChecker.this) {
+                      refreshing = false;
                     }
+                  }
+                } catch (IOException e) {
+                  // ignore
                 }
-            } finally {
-                ts.close();
-            }
+              }
+            };
+            new Thread(refresh, "SpellChecker Refresh").start();
+            lastRefresh = System.currentTimeMillis();
+          }
         }
-
-        /**
-         * Checks the spelling of the passed <code>words</code> and returns a
-         * suggestion.
-         *
-         * @param words the words to check.
-         * @return a suggestion of correctly spelled <code>words</code> or
-         *         <code>null</code> if this spell checker thinks
-         *         <code>words</code> are spelled correctly.
-         * @throws IOException if an error occurs while spell checking.
-         */
-        private String[] check(String words[]) throws IOException {
-            refreshSpellChecker();
-            boolean hasSuggestion = false;
-            IndexReader reader = handler.getIndexReader();
-            try {
-                for (int retries = 0; retries < 100; retries++) {
-                    try {
-                        String[] suggestion = new String[words.length];
-                        for (int i = 0; i < words.length; i++) {
-                            String[] similar = 
spellChecker.suggestSimilar(words[i], 5, reader,
-                                    FieldNames.FULLTEXT, true);
-                            if (similar.length > 0) {
-                                suggestion[i] = similar[0];
-                                hasSuggestion = true;
-                            } else {
-                                suggestion[i] = words[i];
-                            }
-                        }
-                        if (hasSuggestion) {
-                            log.debug("Successful after {} retries", new 
Integer(retries));
-                            return suggestion;
-                        } else {
-                            return null;
-                        }
-                    } catch (AlreadyClosedException e) {
-                        // it may happen that the index reader inside the
-                        // spell checker is closed while searching for
-                        // suggestions. this is actually a design flaw in the
-                        // lucene spell checker, but for now we simply retry
-                    }
-                }
-                // unsuccessful after retries
-                return null;
-            } finally {
-                reader.close();
-            }
-        }
-
-        /**
-         * Refreshes the underlying spell checker in a background thread.
-         * Synchronization is done on this <code>LuceneSpellChecker</code> instance.
-         * While the refresh takes place {@link #refreshing} is set to
-         * <code>true</code>.
-         */
-        private void refreshSpellChecker() {
-            if (lastRefresh + refreshInterval < System.currentTimeMillis()) {
-                synchronized (this) {
-                    if (refreshing) {
-                        return;
-                    } else {
-                        refreshing = true;
-                        Runnable refresh = new Runnable() {
-                            public void run() {
-                                try {
-                                    IndexReader reader = 
handler.getIndexReader();
-                                    try {
-                                        long time = System.currentTimeMillis();
-                                        Dictionary dict = new LuceneDictionary(
-                                                reader, FieldNames.FULLTEXT);
-                                        log.debug("Starting spell checker 
index refresh");
-                                        spellChecker.indexDictionary(dict);
-                                        time = System.currentTimeMillis() - 
time;
-                                        time = time / 1000;
-                                        log.info("Spell checker index 
refreshed in: {} s.",
-                                                new Long(time));
-                                    } finally {
-                                        reader.close();
-                                        synchronized 
(InternalSpellChecker.this) {
-                                            refreshing = false;
-                                        }
-                                    }
-                                } catch (IOException e) {
-                                    // ignore
-                                }
-                            }
-                        };
-                        new Thread(refresh, "SpellChecker Refresh").start();
-                        lastRefresh = System.currentTimeMillis();
-                    }
-                }
-            }
-        }
+      }
     }
+  }
 }


> Upgrade  spellChecker  for 2.x 
> -------------------------------
>
>                 Key: JCR-2818
>                 URL: https://issues.apache.org/jira/browse/JCR-2818
>             Project: Jackrabbit Content Repository
>          Issue Type: Bug
>          Components: jackrabbit-core
>    Affects Versions: 2.1.2, 2.1.1, 2.1.0, 2.0.3, 2.0.0
>            Reporter: Zhou Wu
>
> LuceneSpellChecker is outdated. It needs changes for the 2.x version.

-- 
This message is automatically generated by JIRA.
-
You can reply to this email to add a comment to the issue online.

[jira] Updated: (JCR-2818) Upgrade spellChecker for 2.x

Reply via email to