gsmiller commented on code in PR #12320:
URL: https://github.com/apache/lucene/pull/12320#discussion_r1205833951
##########
lucene/core/src/java/org/apache/lucene/util/automaton/DaciukMihovAutomatonBuilder.java:
##########
@@ -308,17 +316,84 @@ private void replaceOrRegister(State state) {
}
}
- /**
- * Add a suffix of <code>current</code> starting at <code>fromIndex</code>
(inclusive) to state
- * <code>state</code>.
- */
- private void addSuffix(State state, CharSequence current, int fromIndex) {
- final int len = current.length();
- while (fromIndex < len) {
- int cp = Character.codePointAt(current, fromIndex);
- state = state.newState(cp);
- fromIndex += Character.charCount(cp);
+ private static class CharacterBasedBuilder extends
DaciukMihovAutomatonBuilder {
+ private final CharsRefBuilder scratch = new CharsRefBuilder();
+
+ @Override
+ protected void doAdd(BytesRef current) {
+ // Convert the input UTF-8 bytes to CharsRef so we can use the code
points as our transition
+ // labels.
+ scratch.copyUTF8Bytes(current);
+ CharsRef currentChars = scratch.get();
+
+ // Descend in the automaton (find matching prefix).
+ int pos = 0, max = currentChars.length();
+ State state = root;
+ for (; ; ) {
+ assert pos <= max;
+ if (pos == max) {
+ break;
+ }
+
+ int codePoint = Character.codePointAt(currentChars, pos);
+ State next = state.lastChild(codePoint);
+ if (next == null) {
+ break;
+ }
+
+ state = next;
+ pos += Character.charCount(codePoint);
+ }
+
+ if (state.hasChildren()) replaceOrRegister(state);
+
+ addSuffix(state, currentChars, pos);
+ }
+
+ /**
+ * Add a suffix of <code>current</code> starting at <code>fromIndex</code>
(inclusive) to state
+ * <code>state</code>.
+ */
+ private static void addSuffix(State state, CharSequence current, int
fromIndex) {
+ final int len = current.length();
+ while (fromIndex < len) {
+ int cp = Character.codePointAt(current, fromIndex);
+ state = state.newState(cp);
Review Comment:
This is minimizing through the `replaceOrRegister` method that gets called
when "moving on" to a new suffix. Once the common prefix has been found, we can
minimize its most recently added transition since it's now immutable (thanks to
adding terms in sorted order).
Also, +1 to the idea of building to an `Automaton` directly instead of going
through `convert` at the end.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]