Repository: commons-text Updated Branches: refs/heads/master aaf4aba36 -> ce4f20e26
Added support for UTF-16 with surrogate pairs Project: http://git-wip-us.apache.org/repos/asf/commons-text/repo Commit: http://git-wip-us.apache.org/repos/asf/commons-text/commit/15c2e4b2 Tree: http://git-wip-us.apache.org/repos/asf/commons-text/tree/15c2e4b2 Diff: http://git-wip-us.apache.org/repos/asf/commons-text/diff/15c2e4b2 Branch: refs/heads/master Commit: 15c2e4b28686edf6f0807304367dba82ac3d359d Parents: aaf4aba Author: Arun Vinud <arunvinud.sivasubramaniansurianaraya...@capitalone.com> Authored: Wed Jul 12 15:47:02 2017 -0400 Committer: Arun Vinud <arunvinud.sivasubramaniansurianaraya...@capitalone.com> Committed: Wed Jul 12 15:47:25 2017 -0400 ---------------------------------------------------------------------- .../java/org/apache/commons/text/WordUtils.java | 27 +++++++++++--------- .../org/apache/commons/text/WordUtilsTest.java | 26 ++++++++++++++----- 2 files changed, 35 insertions(+), 18 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/commons-text/blob/15c2e4b2/src/main/java/org/apache/commons/text/WordUtils.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/WordUtils.java b/src/main/java/org/apache/commons/text/WordUtils.java index 8e96553..123243f 100644 --- a/src/main/java/org/apache/commons/text/WordUtils.java +++ b/src/main/java/org/apache/commons/text/WordUtils.java @@ -24,11 +24,14 @@ import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.Validate; /** - * <p>Operations on Strings that contain words.</p> + * <p> + * Operations on Strings that contain words. + * </p> * - * <p>This class tries to handle <code>null</code> input gracefully. - * An exception will not be thrown for a <code>null</code> input. - * Each method documents its behaviour in more detail.</p> + * <p> + * This class tries to handle <code>null</code> input gracefully. An exception will not be thrown for a + * <code>null</code> input. Each method documents its behavior in more detail. + * </p> * * @since 1.1 */ @@ -688,22 +691,22 @@ public class WordUtils { return ""; } final int strLen = str.length(); - final char[] buf = new char[strLen / 2 + 1]; + final int [] newCodePoints = new int[strLen / 2 + 1]; int count = 0; boolean lastWasGap = true; - for (int i = 0; i < strLen; i++) { - final char ch = str.charAt(i); + for (int i = 0; i < strLen;) { + final int codePoint = str.codePointAt(i); - if (isDelimiter(ch, delimiters)) { + if (isDelimiter(codePoint, delimiters)) { lastWasGap = true; } else if (lastWasGap) { - buf[count++] = ch; + newCodePoints[count++] = codePoint; lastWasGap = false; - } else { - continue; // ignore ch } + + i += Character.charCount(codePoint); } - return new String(buf, 0, count); + return new String(newCodePoints, 0, count); } //----------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/commons-text/blob/15c2e4b2/src/test/java/org/apache/commons/text/WordUtilsTest.java ---------------------------------------------------------------------- diff --git a/src/test/java/org/apache/commons/text/WordUtilsTest.java b/src/test/java/org/apache/commons/text/WordUtilsTest.java index 271a8f0..beb063a 100644 --- a/src/test/java/org/apache/commons/text/WordUtilsTest.java +++ b/src/test/java/org/apache/commons/text/WordUtilsTest.java @@ -16,16 +16,13 @@ */ package org.apache.commons.text; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertNotNull; -import static org.junit.Assert.assertTrue; +import org.apache.commons.lang3.StringUtils; +import org.junit.Test; import java.lang.reflect.Constructor; import java.lang.reflect.Modifier; -import org.apache.commons.lang3.StringUtils; -import org.junit.Test; +import static org.junit.Assert.*; /** * Unit tests for {@link WordUtils} class. @@ -412,6 +409,23 @@ public class WordUtilsTest { assertEquals("i2", WordUtils.initials("i am here 123", array)); } + @Test + public void testInitialsSurrogatePairs() { + //Tests with space as default delimiter + assertEquals("\uD800\uDF00\uD800\uDF02", WordUtils.initials("\uD800\uDF00\uD800\uDF01 \uD800\uDF02\uD800\uDF03")); + assertEquals("\uD800\uDF00\uD800\uDF02", WordUtils.initials("\uD800\uDF00\uD800\uDF01 \uD800\uDF02\uD800\uDF03", null)); + assertEquals("\uD800\uDF00\uD800\uDF02", WordUtils.initials("\uD800\uDF00 \uD800\uDF02 ", null)); + + //Tests with UTF-16 as delimiters + assertEquals("\uD800\uDF00\uD800\uDF02", WordUtils.initials("\uD800\uDF00\uD800\uDF01.\uD800\uDF02\uD800\uDF03", new char[]{'.'})); + assertEquals("\uD800\uDF00\uD800\uDF02", WordUtils.initials("\uD800\uDF00\uD800\uDF01A\uD800\uDF02\uD800\uDF03", new char[]{'A'})); + + //Tests with UTF-32 as delimiters + assertEquals("\uD800\uDF00\uD800\uDF02", WordUtils.initials("\uD800\uDF00\uD800\uDF01\uD800\uDF14\uD800\uDF02\uD800\uDF03", new char[]{'\uD800', '\uDF14'})); + assertEquals("\uD800\uDF00\uD800\uDF02", WordUtils.initials("\uD800\uDF00\uD800\uDF01\uD800\uDF14\uD800\uDF18\uD800\uDF02\uD800\uDF03", new char[]{'\uD800', '\uDF14', '\uD800', '\uDF18'})); + + } + // ----------------------------------------------------------------------- @Test public void testSwapCase_String() {