[GitHub] [lucene] rmuir commented on a change in pull request #15: LUCENE-8972: Add ICUTransformCharFilter, to support pre-tokenizer ICU text transformation

GitBox Sat, 20 Mar 2021 10:07:46 -0700


rmuir commented on a change in pull request #15:
URL: https://github.com/apache/lucene/pull/15#discussion_r598135382




##########
File path: 
lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUTransformCharFilterFactory.java
##########
@@ -0,0 +1,391 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.icu;
+
+import com.ibm.icu.impl.Utility;
+import com.ibm.icu.text.Normalizer2;
+import com.ibm.icu.text.Transliterator;
+import com.ibm.icu.text.UTF16;
+import java.io.Reader;
+import java.util.ArrayDeque;
+import java.util.Arrays;
+import java.util.Locale;
+import java.util.Map;
+import java.util.WeakHashMap;
+import org.apache.lucene.analysis.CharFilterFactory;
+
+/**
+ * Factory for {@link ICUTransformCharFilter}.
+ *
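+ * <p>Supports the following attributes:
+ *
+ * <ul>
+ *   <li>id (mandatory): A Transliterator ID, one from {@link Transliterator#getAvailableIDs()}
+ *   <li>direction (optional): Either 'forward' or 'reverse'. Default is forward.
+ *   <li>maxRollbackBufferCapacity (optional): An integer; -1 is treated as {@link
+ *       Integer#MAX_VALUE}. Default is {@code
+ *       ICUTransformCharFilter.DEFAULT_MAX_ROLLBACK_BUFFER_CAPACITY}.
+ *   <li>failOnRollbackBufferOverflow (optional): A boolean. Default is {@code
+ *       ICUTransformCharFilter.DEFAULT_FAIL_ON_ROLLBACK_BUFFER_OVERFLOW}.
+ *   <li>suppressUnicodeNormalizationExternalization (optional): A boolean. When true, the
+ *       Transliterator is used as-is, with no attempt to externalize leading or trailing Unicode
+ *       normalization. Default is false.
+ * </ul>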
+ *
+ * @see Transliterator
+ * @since 8.3.0
+ * @lucene.spi {@value #NAME}
+ */
+public class ICUTransformCharFilterFactory extends CharFilterFactory {
+
+  /** SPI name */
+  public static final String NAME = "icuTransform";
+
+  /** Name of the integer factory argument that sets the maximum rollback buffer capacity. */
+  static final String MAX_ROLLBACK_BUFFER_CAPACITY_ARGNAME = 
"maxRollbackBufferCapacity";
+  /** Name of the boolean factory argument stored as {@code failOnRollbackBufferOverflow}. */
+  static final String FAIL_ON_ROLLBACK_BUFFER_OVERFLOW_ARGNAME = 
"failOnRollbackBufferOverflow";
+  /**
+   * Name of the boolean factory argument that, when true, makes the constructor use the stock
+   * Transliterator without externalizing leading/trailing Unicode normalization.
+   */
+  static final String SUPPRESS_UNICODE_NORMALIZATION_EXTERNALIZATION_ARGNAME =
+      "suppressUnicodeNormalizationExternalization";
+  /** Normalization wrapped around the input before the transform in create(); null for none. */
+  private final NormType leading;
+  /** Transliterator handed to every ICUTransformCharFilter built by create(). */
+  private final Transliterator transliterator;
+  /** Normalization wrapped around the transform's output in create(); null for none. */
+  private final NormType trailing;
+  /** Capacity passed to ICUTransformCharFilter; an argument of -1 is stored as Integer.MAX_VALUE. */
+  private final int maxRollbackBufferCapacity;
+  /** Flag passed unchanged to ICUTransformCharFilter. */
+  private final boolean failOnRollbackBufferOverflow;
+
+  // TODO: add support for custom rules
+  /**
+   * Creates a new ICUTransformCharFilterFactory from the given argument map.
+   *
+   * <p>Reads the mandatory {@code id} argument and the optional {@code direction} argument
+   * ({@code forward} or {@code reverse}, defaulting to {@code forward}) and uses them to obtain a
+   * stock {@link Transliterator}. A {@code maxRollbackBufferCapacity} of {@code -1} is stored as
+   * {@link Integer#MAX_VALUE}. Unless {@code suppressUnicodeNormalizationExternalization} is true,
+   * the stock Transliterator is passed to {@code externalizeUnicodeNormalization} and the
+   * resulting leading normalization, Transliterator and trailing normalization are stored;
+   * otherwise the stock Transliterator is stored with no leading or trailing normalization.
+   *
+   * @param args factory arguments. Presumably the inherited accessors remove each entry they read
+   *     (TODO confirm against the superclass), because any entry still present at the end is
+   *     rejected.
+   * @throws IllegalArgumentException if {@code args} is not empty after the recognized arguments
+   *     have been read
+   */
+  public ICUTransformCharFilterFactory(Map<String, String> args) {
+    super(args);
+    String id = require(args, "id");
+    String direction =
+        get(args, "direction", Arrays.asList("forward", "reverse"), "forward", 
false);
+    int dir = "forward".equals(direction) ? Transliterator.FORWARD : 
Transliterator.REVERSE;
+    int tmpCapacityHint =
+        getInt(
+            args,
+            MAX_ROLLBACK_BUFFER_CAPACITY_ARGNAME,
+            ICUTransformCharFilter.DEFAULT_MAX_ROLLBACK_BUFFER_CAPACITY);
+    this.maxRollbackBufferCapacity = tmpCapacityHint == -1 ? Integer.MAX_VALUE 
: tmpCapacityHint;
+    this.failOnRollbackBufferOverflow =
+        getBoolean(
+            args,
+            FAIL_ON_ROLLBACK_BUFFER_OVERFLOW_ARGNAME,
+            ICUTransformCharFilter.DEFAULT_FAIL_ON_ROLLBACK_BUFFER_OVERFLOW);
+    boolean suppressUnicodeNormalizationExternalization =
+        getBoolean(args, 
SUPPRESS_UNICODE_NORMALIZATION_EXTERNALIZATION_ARGNAME, false);
+    Transliterator stockTransliterator = Transliterator.getInstance(id, dir);
+    if (suppressUnicodeNormalizationExternalization) {
+      this.leading = null;
+      this.transliterator = stockTransliterator;
+      this.trailing = null;
+    } else {
+      ExternalNormalization ext = 
externalizeUnicodeNormalization(stockTransliterator);
+      this.leading = ext.leading;
+      this.transliterator = ext.t;
+      this.trailing = ext.trailing;
+    }
+    if (!args.isEmpty()) {
+      throw new IllegalArgumentException("Unknown parameters: " + args);
+    }
+  }
+
+  /**
+   * Wraps a Reader in an {@link ICUNormalizer2CharFilter} applying the requested normalization.
+   *
+   * @param normType the normalization to apply, or null to apply none
+   * @param r the Reader to wrap
+   * @return {@code r} itself when {@code normType} is null, otherwise a new normalizing filter
+   *     reading from {@code r}
+   * @throws UnsupportedOperationException if {@code normType} is any value other than NFC, NFD,
+   *     NFKC or NFKD
+   */
+  private static Reader wrapReader(NormType normType, Reader r) {
+    if (normType == null) {
+      return r;
+    }
+    final Normalizer2 normalizer;
+    switch (normType) {
+      case NFC:
+        normalizer = Normalizer2.getNFCInstance();
+        break;
+      case NFD:
+        normalizer = Normalizer2.getNFDInstance();
+        break;
+      case NFKC:
+        normalizer = Normalizer2.getNFKCInstance();
+        break;
+      case NFKD:
+        normalizer = Normalizer2.getNFKDInstance();
+        break;
+      default:
+        // The message previously began with "test", a leftover that is misleading here.
+        throw new UnsupportedOperationException(
+            "not yet able to compensate externally for normalization type \"" + normType + "\"");
+    }
+    return new ICUNormalizer2CharFilter(r, normalizer);
+  }
+
+  /**
+   * Default ctor for compatibility with SPI.
+   *
+   * <p>Never completes normally: it throws the exception supplied by {@code
+   * defaultCtorException()}.
+   */
+  public ICUTransformCharFilterFactory() {
+    throw defaultCtorException();
+  }
+
+  /**
+   * Builds the filter chain for one input: the leading normalization (if any), then the ICU
+   * transform, then the trailing normalization (if any).
+   *
+   * @param input the Reader to filter
+   * @return the outermost Reader of the chain
+   */
+  @Override
+  public Reader create(Reader input) {
+    final Reader preNormalized = wrapReader(leading, input);
+    final Reader transformed =
+        new ICUTransformCharFilter(
+            preNormalized, transliterator, maxRollbackBufferCapacity, failOnRollbackBufferOverflow);
+    return wrapReader(trailing, transformed);
+  }
+
+  private static final boolean ESCAPE_UNPRINTABLE = false;
+
+  /**
+   * Attempts to detect and externalize any leading or trailing top-level 
Unicode normalization. In
+   * the event that such normalization is detected, a new "core" 
Transliterator (with any detected
+   * pre-/post-normalization removed) is created and returned.
+   *
+   * <p>The creation of any new "core" Transliterator (and much of the actual 
code in this method)
+   * is based on the {@link 
com.ibm.icu.text.CompoundTransliterator#toRules(boolean)} method (with
+   * the boolean arg replaced by {@link #ESCAPE_UNPRINTABLE} -- always 
<code>false</code> in this
+   * context).
+   *
+   * <p>Outcomes, in the order the code tests for them:
+   *
+   * <ul>
+   *   <li>{@code t.getElements()} has exactly one element: {@code t} is returned unchanged, with
+   *       null leading and trailing normalization.
+   *   <li>The first element's ID is recognized by {@code unicodeNormalizationType}: it becomes
+   *       the leading normalization and is dropped; the last element is also dropped (and becomes
+   *       the trailing normalization) if its ID is recognized too.
+   *   <li>Only the last element's ID is recognized: it becomes the trailing normalization and is
+   *       dropped.
+   *   <li>Neither is recognized: {@code t} is returned unchanged, with null leading and trailing
+   *       normalization.
+   * </ul>
+   *
+   * <p>When elements are dropped, the remaining elements are serialized to a rules String and a
+   * replacement Transliterator is built from it with {@code Transliterator.createFromRules},
+   * always in the FORWARD direction. Its ID is the ID of {@code t} with {@code /X_NO_NORM_IO}
+   * appended, or {@code _X_NO_NORM_IO} if the ID already contains a {@code '/'}.
+   *
+   * <p>NOTE(review): the inline comment in the body that mentions {@code compoundRBTIndex} appears
+   * to be carried over from the ICU method this is based on; no such variable exists in this
+   * method.
+   *
+   * @param t the Transliterator to base modified rules on.
+   * @return simple ExternalNormalization struct containing a non-null 
Transliterator, if possible
+   *     with any leading and trailing Unicode normalization externalized. The 
effect of applying
+   *     the resulting leading Unicode norm, Transliterator, and trailing 
Unicode norm, should be
+   *     equivalent to the effect of applying the input Transliterator t.
+   */
+  private static ExternalNormalization 
externalizeUnicodeNormalization(Transliterator t) {
+    final Transliterator[] trans = t.getElements();
+    final String topLevelId = t.getID();
+    final int start;
+    final int limit;
+    final NormType leading;
+    final NormType trailing;
+    if (trans.length == 1) {
+      warnNestedUnicodeNormalization(trans[0], topLevelId, true);
+      return new ExternalNormalization(null, t, null);
+    } else {
+      final int lastIndex;
+      if ((leading = unicodeNormalizationType(trans[0].getID())) != null) {
+        start = 1;
+        limit =
+            (trailing = unicodeNormalizationType(trans[lastIndex = 
trans.length - 1].getID()))
+                    != null
+                ? lastIndex
+                : trans.length;
+      } else if (warnNestedUnicodeNormalization(trans[0], topLevelId, true)
+          && (trailing = unicodeNormalizationType(trans[lastIndex = 
trans.length - 1].getID()))
+              != null) {
+        start = 0;
+        limit = lastIndex;
+      } else {
+        warnNestedUnicodeNormalization(trans[trans.length - 1], topLevelId, 
false);
+        return new ExternalNormalization(null, t, null);
+      }
+    }
+    // We do NOT call toRules() on our component transliterators, in
+    // general. If we have several rule-based transliterators, this
+    // yields a concatenation of the rules -- not what we want. We do
+    // handle compound RBT transliterators specially -- those for which
+    // compoundRBTIndex >= 0. For the transliterator at compoundRBTIndex,
+    // we do call toRules() recursively.
+    StringBuilder rulesSource = new StringBuilder();
+    if (t.getFilter() != null) {
+      // We might be a compound RBT and if we have a global
+      // filter, then emit it at the top.
+      
rulesSource.append("::").append(t.getFilter().toPattern(ESCAPE_UNPRINTABLE)).append(ID_DELIM);
+    }
+    final int globalFilterEnd = rulesSource.length();
+    boolean hasAnonymousRBTs = false;
+    for (int i = start; i < limit; ++i) {
+      String rule;
+
+      // Anonymous RuleBasedTransliterators (inline rules and
+      // ::BEGIN/::END blocks) are given IDs that begin with
+      // "%Pass": use toRules() to write all the rules to the output
+      // (and insert "::Null;" if we have two in a row)
+      if (trans[i].getID().startsWith("%Pass")) {
+        hasAnonymousRBTs = true;
+        rule = trans[i].toRules(ESCAPE_UNPRINTABLE);
+        if (i > start && trans[i - 1].getID().startsWith("%Pass")) rule = 
"::Null;" + rule;
+
+        // we also use toRules() on CompoundTransliterators (which we
+        // check for by looking for a semicolon in the ID)-- this gets
+        // the list of their child transliterators output in the right
+        // format
+      } else if (trans[i].getID().indexOf(';') >= 0) {
+        rule = trans[i].toRules(ESCAPE_UNPRINTABLE);
+
+        // for everything else, use baseToRules()
+      } else {
+        rule = baseToRules(ESCAPE_UNPRINTABLE, trans[i]);
+      }
+      _smartAppend(rulesSource, '\n');
+      rulesSource.append(rule);
+      _smartAppend(rulesSource, ID_DELIM);
+    }
+    // Analogous to the contract for {@link 
com.ibm.icu.text.Transliterator#toRules(boolean)}, the
+    // modified rules String should be sufficient to recreate a Transliterator 
based on the
+    // specified input Transliterator, via {@link
+    // com.ibm.icu.text.Transliterator#createFromRules(String, String, int)}.
+    final String modifiedRules =
+        hasAnonymousRBTs ? rulesSource.toString() : 
rulesSource.substring(globalFilterEnd);
+    String baseId = t.getID();
+    String modId = baseId.concat(baseId.lastIndexOf('/') < 0 ? "/X_NO_NORM_IO" 
: "_X_NO_NORM_IO");
+    Transliterator replacement =
+        Transliterator.createFromRules(modId, modifiedRules, 
Transliterator.FORWARD);
+    return new ExternalNormalization(leading, replacement, trailing);
+  }
+
+  /**
+   * Immutable holder for the result of {@code externalizeUnicodeNormalization}: a leading
+   * normalization type (null for none), the Transliterator to apply, and a trailing normalization
+   * type (null for none).
+   */
+  static class ExternalNormalization {
+    private final NormType leading;
+    private final Transliterator t;
+    private final NormType trailing;
+
+    private ExternalNormalization(NormType leading, Transliterator t, NormType 
trailing) {
+      this.leading = leading;
+      this.t = t;
+      this.trailing = trailing;
+    }
+  }
+
+  /**
+   * It is possible that leading and trailing (or singleton) Transliterators 
might apply nested
+   * Unicode normalization, thus acting in the capacity of a Normalizer, 
without qualifying as a
+   * top-level Normalizer as currently defined in {@link 
#unicodeNormalizationType(String)}. For
+   * now, we will emit a warning if users request stripping of i/o unicode 
normalization in such a
+   * case (to facilitate reporting and more nuanced handling in the future, if 
necessary).
+   *
+   * @param levelOne a level one component Transliterator to test for nested 
unicode normalization.
+   * @param topLevelId top level Transliterator parent id.
+   * @param leading true if leading component transliterator; false implies 
trailing component
+   * @return always return true, for easy integration with control flow.
+   */
+  private static boolean warnNestedUnicodeNormalization(

Review comment:
       It is not helpful to print/warn/log/populate-weak-maps. Instead, let's 
replace this with a JUnit test that loops through 
Transliterator.getAvailableIDs() (or similar) and asserts the expected 
properties of each one, failing if any does not meet expectations. That way, 
tests fail on an ICU version upgrade if something newly introduced must be 
dealt with.
   




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org



---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org
For additional commands, e-mail: issues-h...@lucene.apache.org

[GitHub] [lucene] rmuir commented on a change in pull request #15: LUCENE-8972: Add ICUTransformCharFilter, to support pre-tokenizer ICU text transformation

Reply via email to