This is an automated email from the ASF dual-hosted git repository.
tommaso pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/opennlp.git
The following commit(s) were added to refs/heads/master by this push:
new 1e851de no jira - added javadoc to normalizers (#285)
1e851de is described below
commit 1e851de8fe60a7122bd7b75b4278bf23bd0ec12d
Author: Tommaso Teofili <[email protected]>
AuthorDate: Sun Nov 5 17:02:42 2017 +0100
no jira - added javadoc to normalizers (#285)
No Jira - Add javadoc to normalizers
---
.../tools/util/normalizer/CharSequenceNormalizer.java | 15 +++++++++++++--
.../util/normalizer/EmojiCharSequenceNormalizer.java | 8 ++++----
.../util/normalizer/NumberCharSequenceNormalizer.java | 5 +++--
.../util/normalizer/ShrinkCharSequenceNormalizer.java | 5 +++--
.../util/normalizer/TwitterCharSequenceNormalizer.java | 5 +++--
.../tools/util/normalizer/UrlCharSequenceNormalizer.java | 5 +++--
6 files changed, 29 insertions(+), 14 deletions(-)
diff --git
a/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/CharSequenceNormalizer.java
b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/CharSequenceNormalizer.java
index b5c1f3f..e09578c 100644
---
a/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/CharSequenceNormalizer.java
+++
b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/CharSequenceNormalizer.java
@@ -14,10 +14,21 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-
-
package opennlp.tools.util.normalizer;
+/**
+ * A char sequence normalizer, used to adjust (prune, substitute, add, etc.)
+ * characters in order to remove noise from text.
+ *
+ * @see <a href="https://en.wikipedia.org/wiki/Text_normalization">Text normalization</a>
+ *
+ */
public interface CharSequenceNormalizer {
+
+ /**
+ * Normalizes a sequence of characters.
+ * @param text the char sequence to normalize
+ * @return the normalized char sequence
+ */
CharSequence normalize(CharSequence text);
}
diff --git
a/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/EmojiCharSequenceNormalizer.java
b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/EmojiCharSequenceNormalizer.java
index d1c161c..c7e66e3 100644
---
a/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/EmojiCharSequenceNormalizer.java
+++
b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/EmojiCharSequenceNormalizer.java
@@ -14,12 +14,13 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-
-
package opennlp.tools.util.normalizer;
import java.util.regex.Pattern;
+/**
+ * Normalizer for emojis.
+ */
public class EmojiCharSequenceNormalizer implements CharSequenceNormalizer {
private static final EmojiCharSequenceNormalizer INSTANCE = new
EmojiCharSequenceNormalizer();
@@ -32,7 +33,6 @@ public class EmojiCharSequenceNormalizer implements
CharSequenceNormalizer {
Pattern.compile("[\\uD83C-\\uDBFF\\uDC00-\\uDFFF]+");
public CharSequence normalize (CharSequence text) {
- String modified = EMOJI_REGEX.matcher(text).replaceAll(" ");
- return modified;
+ return EMOJI_REGEX.matcher(text).replaceAll(" ");
}
}
diff --git
a/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/NumberCharSequenceNormalizer.java
b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/NumberCharSequenceNormalizer.java
index 6b0452d..5fe0f62 100644
---
a/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/NumberCharSequenceNormalizer.java
+++
b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/NumberCharSequenceNormalizer.java
@@ -14,12 +14,13 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-
-
package opennlp.tools.util.normalizer;
import java.util.regex.Pattern;
+/**
+ * Normalizer for numbers
+ */
public class NumberCharSequenceNormalizer implements CharSequenceNormalizer {
private static final Pattern NUMBER_REGEX = Pattern.compile("\\d+");
diff --git
a/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/ShrinkCharSequenceNormalizer.java
b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/ShrinkCharSequenceNormalizer.java
index 6183367..cc1c15e 100644
---
a/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/ShrinkCharSequenceNormalizer.java
+++
b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/ShrinkCharSequenceNormalizer.java
@@ -14,12 +14,13 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-
-
package opennlp.tools.util.normalizer;
import java.util.regex.Pattern;
+/**
+ * Normalizer to shrink repeated spaces / chars
+ */
public class ShrinkCharSequenceNormalizer implements CharSequenceNormalizer {
private static final Pattern REPEATED_CHAR_REGEX =
Pattern.compile("(.)\\1{2,}",
diff --git
a/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/TwitterCharSequenceNormalizer.java
b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/TwitterCharSequenceNormalizer.java
index b5a8625..69c7068 100644
---
a/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/TwitterCharSequenceNormalizer.java
+++
b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/TwitterCharSequenceNormalizer.java
@@ -14,12 +14,13 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-
-
package opennlp.tools.util.normalizer;
import java.util.regex.Pattern;
+/**
+ * Normalizer for Twitter character sequences
+ */
public class TwitterCharSequenceNormalizer implements CharSequenceNormalizer {
private static final Pattern HASH_USER_REGEX =
diff --git
a/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/UrlCharSequenceNormalizer.java
b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/UrlCharSequenceNormalizer.java
index 4be9b63..847f86d 100644
---
a/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/UrlCharSequenceNormalizer.java
+++
b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/UrlCharSequenceNormalizer.java
@@ -14,12 +14,13 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-
-
package opennlp.tools.util.normalizer;
import java.util.regex.Pattern;
+/**
+ * Normalizer that removes URLs and email addresses.
+ */
public class UrlCharSequenceNormalizer implements CharSequenceNormalizer {
private static final Pattern URL_REGEX =
--
To stop receiving notification emails like this one, please contact
['"[email protected]" <[email protected]>'].