I created https://issues.apache.org/jira/browse/LANG-1770 to track this report.

Gary

On Fri, Apr 11, 2025 at 10:15 AM Carsten Kirschner
<carsten.kirsch...@corussoft.de.invalid> wrote:
>
> Hello,
>
> The current commons lang3 StringUtils.abbreviate (3.17.0) implementation will 
> destroy 4 byte emoji characters and larger grapheme clusters. I know that 
> handling graphemes correctly before Java 20 is not possible, but at least a 
> code-point-aware solution with String.offsetByCodePoints could be built. I 
> wrote a small test to show the problem.
> The zero width joiners in the family emoji are questionable for the 
> abbreviate, but there should never be a question mark for an invalid char in 
> the result as there is now.
>
> The problem is not so much the „doesn’t look nice“ aspect of the broken 
> emoji, but if that abbreviated string is passed to an XML Writer 
> (com.ctc.wstx.io.UTF8Writer in my case) it throws an exception on this broken 
> byte sequence. Like this: Caused by: java.io.IOException: Broken surrogate 
> pair: first char 0xd83c, second 0x2e; illegal combination
>                 at 
> com.ctc.wstx.io.UTF8Writer._convertSurrogate(UTF8Writer.java:402) 
> ~[woodstox-core-7.0.0.jar:7.0.0]
>
> Thanks,
> Carsten
>
>
>
> import org.apache.commons.lang3.StringUtils;
> import org.junit.Test;
> import static org.junit.Assert.*;
>
> public class AbbreviateTest {
>
>                 String[] expectedResultsFox = {
>                                                "🦊...", // 4
>                                                "🦊🦊...",
>                                                "🦊🦊🦊...",
>                                                "🦊🦊🦊🦊...",
>                                                "🦊🦊🦊🦊🦊...",
>                                                "🦊🦊🦊🦊🦊🦊...",
>                                                "🦊🦊🦊🦊🦊🦊🦊...", // 10
>                 };
>
>                 String[] expectedResultsFamilyWithCodepoints = {
>                                                "👩...",
>                                                "👩🏻...",
>                                                "👩🏻‍...", // zero width joiner
>                                                "👩🏻‍👨...",
>                                                "👩🏻‍👨🏻...",
>                                                "👩🏻‍👨🏻‍...",
>                                                "👩🏻‍👨🏻‍👦..."
>                 };
>
>                 String[] expectedResultsFamilyWithGrapheme = {
>                                                "👩🏻‍👨🏻‍👦🏻‍👦🏻...", // 4
>                                                "👩🏻‍👨🏻‍👦🏻‍👦🏻👩🏼‍👨🏼‍👦🏼‍👦🏼...",
>                                                
> "👩🏻‍👨🏻‍👦🏻‍👦🏻👩🏼‍👨🏼‍👦🏼‍👦🏼👩🏽‍👨🏽‍👦🏽‍👦🏽...",
>                                                
> "👩🏻‍👨🏻‍👦🏻‍👦🏻👩🏼‍👨🏼‍👦🏼‍👦🏼👩🏽‍👨🏽‍👦🏽‍👦🏽👩🏾‍👨🏾‍👦🏾‍👦🏾...",
>                                                
> "👩🏻‍👨🏻‍👦🏻‍👦🏻👩🏼‍👨🏼‍👦🏼‍👦🏼👩🏽‍👨🏽‍👦🏽‍👦🏽👩🏾‍👨🏾‍👦🏾‍👦🏾👩🏿‍👨🏿‍👦🏿‍👦🏿...",
>                                                
> "👩🏻‍👨🏻‍👦🏻‍👦🏻👩🏼‍👨🏼‍👦🏼‍👦🏼👩🏽‍👨🏽‍👦🏽‍👦🏽👩🏾‍👨🏾‍👦🏾‍👦🏾👩🏿‍👨🏿‍👦🏿‍👦🏿👩🏻‍👨🏻‍👦🏻‍👦🏻...",
>                                                
> "👩🏻‍👨🏻‍👦🏻‍👦🏻👩🏼‍👨🏼‍👦🏼‍👦🏼👩🏽‍👨🏽‍👦🏽‍👦🏽👩🏾‍👨🏾‍👦🏾‍👦🏾👩🏿‍👨🏿‍👦🏿‍👦🏿👩🏻‍👨🏻‍👦🏻‍👦🏻👩🏼‍👨🏼‍👦🏼‍👦🏼..."
>  // 10
>                 };
>
>                 @Test
>                 public void abberviateTest() {
>                                String abbreviateResult;
>                                for(var i = 4; i <= 10; i++) {
>                                                abbreviateResult = 
> StringUtils.abbreviate("🦊🦊🦊🦊🦊🦊🦊🦊🦊🦊🦊🦊🦊🦊", i);
>                                                
> System.out.println(abbreviateResult);
>                                                
> //assertEquals(expectedResultsFox[i - 4], abbreviateResult);
>                                }
>                                for(var i = 4; i <= 10; i++) {
>                                                abbreviateResult = 
> StringUtils.abbreviate("👩🏻‍👨🏻‍👦🏻‍👦🏻👩🏼‍👨🏼‍👦🏼‍👦🏼👩🏽‍👨🏽‍👦🏽‍👦🏽👩🏾‍👨🏾‍👦🏾‍👦🏾👩🏿‍👨🏿‍👦🏿‍👦🏿👩🏻‍👨🏻‍👦🏻‍👦🏻👩🏼‍👨🏼‍👦🏼‍👦🏼👩🏽‍👨🏽‍👦🏽‍👦🏽👩🏾‍👨🏾‍👦🏾‍👦🏾👩🏿‍👨🏿‍👦🏿‍👦🏿",
>  i);
>                                                
> System.out.println(abbreviateResult);
>                                                
> //assertEquals(expectedResultsFamilyWithCodepoints[i - 4], abbreviateResult);
>                                }
>                 }
> }
>
>

---------------------------------------------------------------------
To unsubscribe, e-mail: dev-unsubscr...@commons.apache.org
For additional commands, e-mail: dev-h...@commons.apache.org

Reply via email to