Hello Gary, I updated the unit test and, I think, removed the guessing part. This page shows nicely how the family grapheme cluster is composed: https://utf-8-visualizer.ardis.lu/?q=%F0%9F%91%A8%F0%9F%8F%BB%E2%80%8D%F0%9F%91%A9%F0%9F%8F%BB%E2%80%8D%F0%9F%91%A6%F0%9F%8F%BB%E2%80%8D%F0%9F%91%A6%F0%9F%8F%BB
Kind regards Carsten import org.apache.commons.lang3.StringUtils; import org.junit.Test; import static org.junit.Assert.*; public class AbbreviateTest { String[] expectedResultsFox = { "🦊...", // 4 "🦊🦊...", "🦊🦊🦊...", "🦊🦊🦊🦊...", "🦊🦊🦊🦊🦊...", "🦊🦊🦊🦊🦊🦊...", "🦊🦊🦊🦊🦊🦊🦊...", // 10 }; String[] expectedResultsFamilyWithCodepoints = { "👩...", // 4 "👩🏻...", "👩🏻...", // zero width joiner "👩🏻👨...", "👩🏻👨🏻...", "👩🏻👨🏻...", "👩🏻👨🏻👦..." }; String[] expectedResultsFamilyWithGrapheme = { "👩🏻👨🏻👦🏻👦🏻...", // 4 "👩🏻👨🏻👦🏻👦🏻👩🏼👨🏼👦🏼👦🏼...", "👩🏻👨🏻👦🏻👦🏻👩🏼👨🏼👦🏼👦🏼👩🏽👨🏽👦🏽👦🏽...", "👩🏻👨🏻👦🏻👦🏻👩🏼👨🏼👦🏼👦🏼👩🏽👨🏽👦🏽👦🏽👩🏾👨🏾👦🏾👦🏾...", "👩🏻👨🏻👦🏻👦🏻👩🏼👨🏼👦🏼👦🏼👩🏽👨🏽👦🏽👦🏽👩🏾👨🏾👦🏾👦🏾👩🏿👨🏿👦🏿👦🏿...", "👩🏻👨🏻👦🏻👦🏻👩🏼👨🏼👦🏼👦🏼👩🏽👨🏽👦🏽👦🏽👩🏾👨🏾👦🏾👦🏾👩🏿👨🏿👦🏿👦🏿👩🏻👨🏻👦🏻👦🏻...", "👩🏻👨🏻👦🏻👦🏻👩🏼👨🏼👦🏼👦🏼👩🏽👨🏽👦🏽👦🏽👩🏾👨🏾👦🏾👦🏾👩🏿👨🏿👦🏿👦🏿👩🏻👨🏻👦🏻👦🏻👩🏼👨🏼👦🏼👦🏼..." // 10 }; @Test public void abberviate4ByteCharsShouldNotContainBrokenSurrogatePairs() { String abbreviateResult; for(var i = 4; i <= 10; i++) { abbreviateResult = StringUtils.abbreviate("🦊🦊🦊🦊🦊🦊🦊🦊🦊🦊🦊🦊🦊🦊", i); String expectedFox = expectedResultsFox[i - 4]; assertEquals("The abbreviated String contains a broken surrogate pair", expectedFox, abbreviateResult); assertEquals("There are not enough codepoints in the result (2 for each fox, 3 for the dots)", expectedFox.codePointCount(0, expectedFox.length()), abbreviateResult.codePointCount(0,abbreviateResult.length())); } } @Test public void abbreviateGraphemeClusterShouldNotContainBrokenSurrogatePairs() { String abbreviateResult; for(var i = 4; i <= 10; i++) { abbreviateResult = StringUtils.abbreviate("👩🏻👨🏻👦🏻👦🏻👩🏼👨🏼👦🏼👦🏼👩🏽👨🏽👦🏽👦🏽👩🏾👨🏾👦🏾👦🏾👩🏿👨🏿👦🏿👦🏿👩🏻👨🏻👦🏻👦🏻👩🏼👨🏼👦🏼👦🏼👩🏽👨🏽👦🏽👦🏽👩🏾👨🏾👦🏾👦🏾👩🏿👨🏿👦🏿👦🏿", i); String expectedFamily = expectedResultsFamilyWithCodepoints[i - 4]; assertEquals("There are not enough codepoints in the result (2 for each person + 2 for each skin color + 1 for the zero width joiner, 3 for the dots)", expectedFamily.codePointCount(0, expectedFamily.length()), abbreviateResult.codePointCount(0,abbreviateResult.length())); } } @Test public void 
abbreviateGraphemeClusterMayHonorTheGraphemeCluster() { // if the abbreviate function honors the grapheme cluster it would cut after each one, not in the middle of them // but that could bring unwanted behavior. String abbreviateResult; for(var i = 4; i <= 10; i++) { abbreviateResult = StringUtils.abbreviate("👩🏻👨🏻👦🏻👦🏻👩🏼👨🏼👦🏼👦🏼👩🏽👨🏽👦🏽👦🏽👩🏾👨🏾👦🏾👦🏾👩🏿👨🏿👦🏿👦🏿👩🏻👨🏻👦🏻👦🏻👩🏼👨🏼👦🏼👦🏼👩🏽👨🏽👦🏽👦🏽👩🏾👨🏾👦🏾👦🏾👩🏿👨🏿👦🏿👦🏿", i); String expectedFamily = expectedResultsFamilyWithGrapheme[i - 4]; assertEquals("There are not enough codepoints in the result (11 for each family, 3 for the dots)", expectedFamily.codePointCount(0, expectedFamily.length()), abbreviateResult.codePointCount(0,abbreviateResult.length())); } } } On 2025/04/14 12:21:53 Gary Gregory wrote: > Hi Carsten, > > Could you provide a unit test with the expected behavior? The example > you gave has console output and assertions commented out, both of > which are undesirable. Instead of me guessing, I'd rather you manage > expectations and provide a failing/passing set of assertions. > > TY! > Gary > > > On Sun, Apr 13, 2025 at 7:45 PM Gary Gregory > <ga...@gmail.com<mailto:ga...@gmail.com>> wrote: > > > > I created https://issues.apache.org/jira/browse/LANG-1770 to track this > > report. > > > > Gary > > > > On Fri, Apr 11, 2025 at 10:15 AM Carsten Kirschner > > <ca...@corussoft.de.inva<mailto:ca...@corussoft.de.inva>lid> wrote: > > > > > > Hello, > > > > > > The current commons lang3 StringUtils.abbreviate (3.17.0) implementation > > > will destroy 4 byte emoji characters and larger grapheme clusters. I know > > > that handling grapheme correctly before java 20 is not possible, but at > > > least a codepoint aware solution with String.offsetByCodPoints could be > > > build. I wrote a small test to show the problem. > > > The zero width joiners in the family emoji are questionable for the > > > abbreviate, but there should never be a question mark for an invalid char > > > in the result as there is now. 
> > > > > > The problem is not so much the „doesn’t look nice“ aspect of the broken > > > emoji, but if that abbreviated string is passed to an XML Writer > > > (com.ctc.wstx.io.UTF8Writer in my case) it throws an exception on this > > > broken byte sequence. Like this: Caused by: java.io.IOException: Broken > > > surrogate pair: first char 0xd83c, second 0x2e; illegal combination > > > at > > > com.ctc.wstx.io.UTF8Writer._convertSurrogate(UTF8Writer.java:402) > > > ~[woodstox-core-7.0.0.jar:7.0.0] > > > > > > Thanks, > > > Carsten > > > > > > > > > > > > import org.apache.commons.lang3.StringUtils; > > > import org.junit.Test; > > > import static org.junit.Assert.*; > > > > > > public class AbbreviateTest { > > > > > > String[] expectedResultsFox = { > > > "🦊...", // 4 > > > "🦊🦊...", > > > "🦊🦊🦊...", > > > "🦊🦊🦊🦊...", > > > "🦊🦊🦊🦊🦊...", > > > "🦊🦊🦊🦊🦊🦊...", > > > "🦊🦊🦊🦊🦊🦊🦊...", // 10 > > > }; > > > > > > String[] expectedResultsFamilyWithCodepoints = { > > > "👩...", > > > "👩🏻...", > > > "👩🏻...", // zero width > > > joiner > > > "👩🏻👨...", > > > "👩🏻👨🏻...", > > > "👩🏻👨🏻...", > > > "👩🏻👨🏻👦..." > > > }; > > > > > > String[] expectedResultsFamilyWithGrapheme = { > > > "👩🏻👨🏻👦🏻👦🏻...", // 4 > > > > > > "👩🏻👨🏻👦🏻👦🏻👩🏼👨🏼👦🏼👦🏼...", > > > > > > "👩🏻👨🏻👦🏻👦🏻👩🏼👨🏼👦🏼👦🏼👩🏽👨🏽👦🏽👦🏽...", > > > > > > "👩🏻👨🏻👦🏻👦🏻👩🏼👨🏼👦🏼👦🏼👩🏽👨🏽👦🏽👦🏽👩🏾👨🏾👦🏾👦🏾...", > > > > > > "👩🏻👨🏻👦🏻👦🏻👩🏼👨🏼👦🏼👦🏼👩🏽👨🏽👦🏽👦🏽👩🏾👨🏾👦🏾👦🏾👩🏿👨🏿👦🏿👦🏿...", > > > > > > "👩🏻👨🏻👦🏻👦🏻👩🏼👨🏼👦🏼👦🏼👩🏽👨🏽👦🏽👦🏽👩🏾👨🏾👦🏾👦🏾👩🏿👨🏿👦🏿👦🏿👩🏻👨🏻👦🏻👦🏻...", > > > > > > "👩🏻👨🏻👦🏻👦🏻👩🏼👨🏼👦🏼👦🏼👩🏽👨🏽👦🏽👦🏽👩🏾👨🏾👦🏾👦🏾👩🏿👨🏿👦🏿👦🏿👩🏻👨🏻👦🏻👦🏻👩🏼👨🏼👦🏼👦🏼..." 
> > > // 10 > > > }; > > > > > > @Test > > > public void abberviateTest() { > > > String abbreviateResult; > > > for(var i = 4; i <= 10; i++) { > > > abbreviateResult = > > > StringUtils.abbreviate("🦊🦊🦊🦊🦊🦊🦊🦊🦊🦊🦊🦊🦊🦊", i); > > > > > > System.out.println(abbreviateResult); > > > > > > //assertEquals(expectedResultsFox[i - 4], abbreviateResult); > > > } > > > for(var i = 4; i <= 10; i++) { > > > abbreviateResult = > > > StringUtils.abbreviate("👩🏻👨🏻👦🏻👦🏻👩🏼👨🏼👦🏼👦🏼👩🏽👨🏽👦🏽👦🏽👩🏾👨🏾👦🏾👦🏾👩🏿👨🏿👦🏿👦🏿👩🏻👨🏻👦🏻👦🏻👩🏼👨🏼👦🏼👦🏼👩🏽👨🏽👦🏽👦🏽👩🏾👨🏾👦🏾👦🏾👩🏿👨🏿👦🏿👦🏿", > > > i); > > > > > > System.out.println(abbreviateResult); > > > > > > //assertEquals(expectedResultsFamilyWithCodepoints[i - 4], > > > abbreviateResult); > > > } > > > } > > > } > > > > > > >