mawiesne commented on code in PR #568: URL: https://github.com/apache/opennlp/pull/568#discussion_r1430325906
########## opennlp-tools/src/main/java/opennlp/tools/util/StringList.java: ########## @@ -33,19 +35,19 @@ public class StringList implements Iterable<String> { * Initializes a {@link StringList} instance. * <p> * Note: <br> - * Token String will be replaced by identical internal String object. + * Token String will be interened via {@link StringInterners}. * * @param singleToken One single token */ public StringList(String singleToken) { - tokens = new String[]{singleToken.intern()}; + tokens = new String[]{StringInterners.intern(singleToken)}; } /** * Initializes a {@link StringList} instance. * <p> * Note: <br> - * Token Strings will be replaced by identical internal String object. + * Token Strings will be interened via {@link StringInterners}. Review Comment: Should read `interned`, minus the extra `e`. ########## opennlp-tools/src/main/java/opennlp/tools/util/StringList.java: ########## @@ -33,19 +35,19 @@ public class StringList implements Iterable<String> { * Initializes a {@link StringList} instance. * <p> * Note: <br> - * Token String will be replaced by identical internal String object. + * Token String will be interened via {@link StringInterners}. Review Comment: Should read `interned`, minus the extra `e`. ########## opennlp-tools/src/main/java/opennlp/tools/util/jvm/CHMStringDeduplicator.java: ########## @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package opennlp.tools.util.jvm; + + +import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ThreadLocalRandom; + +import opennlp.tools.commons.Internal; +import opennlp.tools.commons.ThreadSafe; + +/** + * A {@link StringInterner} implementation by Aleksey Shipilëv with relaxed canonical requirements. + * It is a probabilistic deduplication implementation with a default probability of {@code 0.5}. + * Users may implement a custom class with a different probability value. + * <p> + * Origin: + * <a href="https://shipilev.net/talks/joker-Oct2014-string-catechism.pdf"> + * https://shipilev.net/talks/joker-Oct2014-string-catechism.pdf</a> + * <p> Review Comment: No extra `<p>` required here. ########## opennlp-tools/src/main/java/opennlp/tools/util/jvm/CHMStringDeduplicator.java: ########## @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package opennlp.tools.util.jvm; + + +import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ThreadLocalRandom; + +import opennlp.tools.commons.Internal; +import opennlp.tools.commons.ThreadSafe; + +/** + * A {@link StringInterner} implementation by Aleksey Shipilëv with relaxed canonical requirements. + * It is a probabilistic deduplication implementation with a default probability of {@code 0.5}. + * Users may implement a custom class with a different probability value. + * <p> + * Origin: + * <a href="https://shipilev.net/talks/joker-Oct2014-string-catechism.pdf"> + * https://shipilev.net/talks/joker-Oct2014-string-catechism.pdf</a> + * <p> + */ +@Internal +@ThreadSafe +class CHMStringDeduplicator implements StringInterner { + private final int prob; + private final Map<String, String> map; + + public CHMStringDeduplicator() { Review Comment: Could have a JavaDoc comment mentioning what the default value for probability will be, if this constructor is used. ########## opennlp-tools/src/main/java/opennlp/tools/util/jvm/CHMStringInterner.java: ########## @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.util.jvm; + +import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; + +import opennlp.tools.commons.Internal; +import opennlp.tools.commons.ThreadSafe; + +/** + * A {@link StringInterner} implementation based on {@link ConcurrentHashMap} by Aleksey Shipilëv. + * <p> + * Origin: + * <a href="https://shipilev.net/jvm/anatomy-quarks/10-string-intern/"> + * https://shipilev.net/jvm/anatomy-quarks/10-string-intern/</a> + * <p> Review Comment: No extra `<p>` required here. ########## opennlp-tools/src/main/java/opennlp/tools/util/jvm/HMStringInterner.java: ########## @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package opennlp.tools.util.jvm; + +import java.util.HashMap; +import java.util.Map; + +import opennlp.tools.commons.Internal; + +/** + * A {@link StringInterner} implementation based on {@link HashMap} by Aleksey Shipilëv. + * This implementation is <b>not</b> thread-safe. + * <p> + * Origin: + * <a href="https://shipilev.net/jvm/anatomy-quarks/10-string-intern/"> + * https://shipilev.net/jvm/anatomy-quarks/10-string-intern/</a> + * <p> Review Comment: No extra `<p>` required here. ########## opennlp-tools/src/main/java/opennlp/tools/util/jvm/JvmStringInterner.java: ########## @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package opennlp.tools.util.jvm; + +import opennlp.tools.commons.Internal; +import opennlp.tools.commons.ThreadSafe; + +/** + * A {@link StringInterner} implementation based on {@code String.intern()}. Strings interned via + * this implementation are put into PermGen space. If needed, the PermGen memory can be increased by + * the JVM argument {@code -XX:MaxPermSize}. + * <p> + * Using this {@link StringInterner} brings back the default behaviour of OpenNLP before version + * {@code 2.3.2}. You can use it by setting the system property {@code opennlp.interner.class} to + * the full qualified classname of a {@link StringInterner} implementation. Review Comment: `full` should read `fully` qualified..., missing `y` ########## opennlp-tools/src/main/java/opennlp/tools/util/jvm/StringInterners.java: ########## @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.util.jvm; + +import java.lang.reflect.Constructor; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Provides string interning utility methods. Interning mechanism can be configured via the + * system property {@code opennlp.interner.class} by specifying an implementation via its + * full qualified classname. It needs to implement {@link StringInterner}. Review Comment: `full` should read `fully` qualified..., missing `y` ########## opennlp-tools/src/main/java/opennlp/tools/util/jvm/StringInterners.java: ########## @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.util.jvm; + +import java.lang.reflect.Constructor; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Provides string interning utility methods. Interning mechanism can be configured via the + * system property {@code opennlp.interner.class} by specifying an implementation via its + * full qualified classname. It needs to implement {@link StringInterner}. + * <p> + * If not specified by the user, the default interner is {@link CHMStringInterner}. + */ +public class StringInterners { + + private static final Logger LOGGER = LoggerFactory.getLogger(StringInterners.class); + private static final StringInterner INTERNER; + + static { + + final String clazzName = System.getProperty("opennlp.interner.class", + CHMStringInterner.class.getCanonicalName()); + + try { + final Class<?> clazz = Class.forName(clazzName); + final Constructor<?> cons = clazz.getDeclaredConstructor(); + INTERNER = (StringInterner) cons.newInstance(); + LOGGER.debug("Using '{}' as String interner implementation.", clazzName); + } catch (Exception e) { + throw new RuntimeException("Could not load specified String interner implementation: '" + + clazzName + "'. Reason: " + e.getLocalizedMessage(), e); + } + + } + + /** + * Interns and returns a reference to the representative instance + * for any of a collection of string instances that are equal to each other. Review Comment: "for any of a collection of string" <- this reads odd to me. Better without "of a": "for any collection of string ..."? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: dev-unsubscr...@opennlp.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org