Hello Gary,

I updated the unit test and, I think, removed the guessing part.
This page shows nicely how the family grapheme cluster is composed:
https://utf-8-visualizer.ardis.lu/?q=%F0%9F%91%A8%F0%9F%8F%BB%E2%80%8D%F0%9F%91%A9%F0%9F%8F%BB%E2%80%8D%F0%9F%91%A6%F0%9F%8F%BB%E2%80%8D%F0%9F%91%A6%F0%9F%8F%BB

Kind regards
Carsten


import org.apache.commons.lang3.StringUtils;
import org.junit.Test;
import static org.junit.Assert.*;

public class AbbreviateTest {

                String[] expectedResultsFox = {
                                               "🦊...", // 4
                                               "🦊🦊...",
                                               "🦊🦊🦊...",
                                               "🦊🦊🦊🦊...",
                                               "🦊🦊🦊🦊🦊...",
                                               "🦊🦊🦊🦊🦊🦊...",
                                               "🦊🦊🦊🦊🦊🦊🦊...", // 10
                };

                String[] expectedResultsFamilyWithCodepoints = {
                                               "👩...", // 4
                                               "👩🏻...",
                                               "👩🏻‍...", // zero width joiner
                                               "👩🏻‍👨...",
                                               "👩🏻‍👨🏻...",
                                               "👩🏻‍👨🏻‍...",
                                               "👩🏻‍👨🏻‍👦..."
                };

                String[] expectedResultsFamilyWithGrapheme = {
                                               "👩🏻‍👨🏻‍👦🏻‍👦🏻...", // 4
                                               "👩🏻‍👨🏻‍👦🏻‍👦🏻👩🏼‍👨🏼‍👦🏼‍👦🏼...",
                                               
"👩🏻‍👨🏻‍👦🏻‍👦🏻👩🏼‍👨🏼‍👦🏼‍👦🏼👩🏽‍👨🏽‍👦🏽‍👦🏽...",
                                               
"👩🏻‍👨🏻‍👦🏻‍👦🏻👩🏼‍👨🏼‍👦🏼‍👦🏼👩🏽‍👨🏽‍👦🏽‍👦🏽👩🏾‍👨🏾‍👦🏾‍👦🏾...",
                                               
"👩🏻‍👨🏻‍👦🏻‍👦🏻👩🏼‍👨🏼‍👦🏼‍👦🏼👩🏽‍👨🏽‍👦🏽‍👦🏽👩🏾‍👨🏾‍👦🏾‍👦🏾👩🏿‍👨🏿‍👦🏿‍👦🏿...",
                                               
"👩🏻‍👨🏻‍👦🏻‍👦🏻👩🏼‍👨🏼‍👦🏼‍👦🏼👩🏽‍👨🏽‍👦🏽‍👦🏽👩🏾‍👨🏾‍👦🏾‍👦🏾👩🏿‍👨🏿‍👦🏿‍👦🏿👩🏻‍👨🏻‍👦🏻‍👦🏻...",
                                               
"👩🏻‍👨🏻‍👦🏻‍👦🏻👩🏼‍👨🏼‍👦🏼‍👦🏼👩🏽‍👨🏽‍👦🏽‍👦🏽👩🏾‍👨🏾‍👦🏾‍👦🏾👩🏿‍👨🏿‍👦🏿‍👦🏿👩🏻‍👨🏻‍👦🏻‍👦🏻👩🏼‍👨🏼‍👦🏼‍👦🏼..."
 // 10
                };

                @Test
                public void 
abberviate4ByteCharsShouldNotContainBrokenSurrogatePairs() {
                               String abbreviateResult;
                               for(var i = 4; i <= 10; i++) {
                                               abbreviateResult = 
StringUtils.abbreviate("🦊🦊🦊🦊🦊🦊🦊🦊🦊🦊🦊🦊🦊🦊", i);
                                               String expectedFox = 
expectedResultsFox[i - 4];
                                               assertEquals("The abbreviated 
String contains a broken surrogate pair", expectedFox, abbreviateResult);
                                               assertEquals("There are not 
enough codepoints in the result (2 for each fox, 3 for the dots)", 
expectedFox.codePointCount(0, expectedFox.length()), 
abbreviateResult.codePointCount(0,abbreviateResult.length()));
                               }
                }

                @Test
                public void 
abbreviateGraphemeClusterShouldNotContainBrokenSurrogatePairs() {
                               String abbreviateResult;
                               for(var i = 4; i <= 10; i++) {
                                               abbreviateResult = 
StringUtils.abbreviate("👩🏻‍👨🏻‍👦🏻‍👦🏻👩🏼‍👨🏼‍👦🏼‍👦🏼👩🏽‍👨🏽‍👦🏽‍👦🏽👩🏾‍👨🏾‍👦🏾‍👦🏾👩🏿‍👨🏿‍👦🏿‍👦🏿👩🏻‍👨🏻‍👦🏻‍👦🏻👩🏼‍👨🏼‍👦🏼‍👦🏼👩🏽‍👨🏽‍👦🏽‍👦🏽👩🏾‍👨🏾‍👦🏾‍👦🏾👩🏿‍👨🏿‍👦🏿‍👦🏿",
 i);
                                               String expectedFamily = 
expectedResultsFamilyWithCodepoints[i - 4];
                                               assertEquals("There are not 
enough codepoints in the result (2 for each person + 2 for each skin color + 1 
for the zero width joiner, 3 for the dots)", expectedFamily.codePointCount(0, 
expectedFamily.length()), 
abbreviateResult.codePointCount(0,abbreviateResult.length()));
                               }
                }

                @Test
                public void 
abbreviateGraphemeClusterMayHonorTheGraphemeCluster() {
                               // if the abbreviate function honors the 
grapheme cluster it would cut after each one, not in the middle of them
                               // but that could bring unwanted behavior.

                               String abbreviateResult;
                               for(var i = 4; i <= 10; i++) {
                                               abbreviateResult = 
StringUtils.abbreviate("👩🏻‍👨🏻‍👦🏻‍👦🏻👩🏼‍👨🏼‍👦🏼‍👦🏼👩🏽‍👨🏽‍👦🏽‍👦🏽👩🏾‍👨🏾‍👦🏾‍👦🏾👩🏿‍👨🏿‍👦🏿‍👦🏿👩🏻‍👨🏻‍👦🏻‍👦🏻👩🏼‍👨🏼‍👦🏼‍👦🏼👩🏽‍👨🏽‍👦🏽‍👦🏽👩🏾‍👨🏾‍👦🏾‍👦🏾👩🏿‍👨🏿‍👦🏿‍👦🏿",
 i);
                                               String expectedFamily = 
expectedResultsFamilyWithGrapheme[i - 4];
                                               assertEquals("There are not 
enough codepoints in the result (11 for each family, 3 for the dots)", 
expectedFamily.codePointCount(0, expectedFamily.length()), 
abbreviateResult.codePointCount(0,abbreviateResult.length()));
                               }
                }
}

On 2025/04/14 12:21:53 Gary Gregory wrote:
> Hi Carsten,
>
> Could you provide a unit test with the expected behavior? The example
> you gave has console output and assertions commented out, both of
> which are undesirable. Instead of me guessing, I'd rather you manage
> expectations and provide a failing/passing set of assertions.
>
> TY!
> Gary
>
>
> On Sun, Apr 13, 2025 at 7:45 PM Gary Gregory 
> <ga...@gmail.com<mailto:ga...@gmail.com>> wrote:
> >
> > I created https://issues.apache.org/jira/browse/LANG-1770 to track this 
> > report.
> >
> > Gary
> >
> > On Fri, Apr 11, 2025 at 10:15 AM Carsten Kirschner
> > <ca...@corussoft.de.inva<mailto:ca...@corussoft.de.inva>lid> wrote:
> > >
> > > Hello,
> > >
> > > The current commons lang3 StringUtils.abbreviate (3.17.0) implementation 
> > > will destroy 4-byte emoji characters and larger grapheme clusters. I know 
> > > that handling graphemes correctly before Java 20 is not possible, but at 
> > > least a code-point-aware solution with String.offsetByCodePoints could be 
> > > built. I wrote a small test to show the problem.
> > > The zero-width joiners in the family emoji are questionable for the 
> > > abbreviate, but there should never be a question mark for an invalid char 
> > > in the result as there is now.
> > >
> > > The problem is not so much the „doesn’t look nice“ aspect of the broken 
> > > emoji, but that when the abbreviated string is passed to an XML writer 
> > > (com.ctc.wstx.io.UTF8Writer in my case), the writer throws an exception on 
> > > the broken byte sequence, like this: Caused by: java.io.IOException: Broken 
> > > surrogate pair: first char 0xd83c, second 0x2e; illegal combination
> > >                 at 
> > > com.ctc.wstx.io.UTF8Writer._convertSurrogate(UTF8Writer.java:402) 
> > > ~[woodstox-core-7.0.0.jar:7.0.0]
> > >
> > > Thanks,
> > > Carsten
> > >
> > >
> > >
> > > import org.apache.commons.lang3.StringUtils;
> > > import org.junit.Test;
> > > import static org.junit.Assert.*;
> > >
> > > public class AbbreviateTest {
> > >
> > >                 String[] expectedResultsFox = {
> > >                                                "🦊...", // 4
> > >                                                "🦊🦊...",
> > >                                                "🦊🦊🦊...",
> > >                                                "🦊🦊🦊🦊...",
> > >                                                "🦊🦊🦊🦊🦊...",
> > >                                                "🦊🦊🦊🦊🦊🦊...",
> > >                                                "🦊🦊🦊🦊🦊🦊🦊...", // 10
> > >                 };
> > >
> > >                 String[] expectedResultsFamilyWithCodepoints = {
> > >                                                "👩...",
> > >                                                "👩🏻...",
> > >                                                "👩🏻‍...", // zero width 
> > > joiner
> > >                                                "👩🏻‍👨...",
> > >                                                "👩🏻‍👨🏻...",
> > >                                                "👩🏻‍👨🏻‍...",
> > >                                                "👩🏻‍👨🏻‍👦..."
> > >                 };
> > >
> > >                 String[] expectedResultsFamilyWithGrapheme = {
> > >                                                "👩🏻‍👨🏻‍👦🏻‍👦🏻...", // 4
> > >                                                
> > > "👩🏻‍👨🏻‍👦🏻‍👦🏻👩🏼‍👨🏼‍👦🏼‍👦🏼...",
> > >                                                
> > > "👩🏻‍👨🏻‍👦🏻‍👦🏻👩🏼‍👨🏼‍👦🏼‍👦🏼👩🏽‍👨🏽‍👦🏽‍👦🏽...",
> > >                                                
> > > "👩🏻‍👨🏻‍👦🏻‍👦🏻👩🏼‍👨🏼‍👦🏼‍👦🏼👩🏽‍👨🏽‍👦🏽‍👦🏽👩🏾‍👨🏾‍👦🏾‍👦🏾...",
> > >                                                
> > > "👩🏻‍👨🏻‍👦🏻‍👦🏻👩🏼‍👨🏼‍👦🏼‍👦🏼👩🏽‍👨🏽‍👦🏽‍👦🏽👩🏾‍👨🏾‍👦🏾‍👦🏾👩🏿‍👨🏿‍👦🏿‍👦🏿...",
> > >                                                
> > > "👩🏻‍👨🏻‍👦🏻‍👦🏻👩🏼‍👨🏼‍👦🏼‍👦🏼👩🏽‍👨🏽‍👦🏽‍👦🏽👩🏾‍👨🏾‍👦🏾‍👦🏾👩🏿‍👨🏿‍👦🏿‍👦🏿👩🏻‍👨🏻‍👦🏻‍👦🏻...",
> > >                                                
> > > "👩🏻‍👨🏻‍👦🏻‍👦🏻👩🏼‍👨🏼‍👦🏼‍👦🏼👩🏽‍👨🏽‍👦🏽‍👦🏽👩🏾‍👨🏾‍👦🏾‍👦🏾👩🏿‍👨🏿‍👦🏿‍👦🏿👩🏻‍👨🏻‍👦🏻‍👦🏻👩🏼‍👨🏼‍👦🏼‍👦🏼..."
> > >  // 10
> > >                 };
> > >
> > >                 @Test
> > >                 public void abberviateTest() {
> > >                                String abbreviateResult;
> > >                                for(var i = 4; i <= 10; i++) {
> > >                                                abbreviateResult = 
> > > StringUtils.abbreviate("🦊🦊🦊🦊🦊🦊🦊🦊🦊🦊🦊🦊🦊🦊", i);
> > >                                                
> > > System.out.println(abbreviateResult);
> > >                                                
> > > //assertEquals(expectedResultsFox[i - 4], abbreviateResult);
> > >                                }
> > >                                for(var i = 4; i <= 10; i++) {
> > >                                                abbreviateResult = 
> > > StringUtils.abbreviate("👩🏻‍👨🏻‍👦🏻‍👦🏻👩🏼‍👨🏼‍👦🏼‍👦🏼👩🏽‍👨🏽‍👦🏽‍👦🏽👩🏾‍👨🏾‍👦🏾‍👦🏾👩🏿‍👨🏿‍👦🏿‍👦🏿👩🏻‍👨🏻‍👦🏻‍👦🏻👩🏼‍👨🏼‍👦🏼‍👦🏼👩🏽‍👨🏽‍👦🏽‍👦🏽👩🏾‍👨🏾‍👦🏾‍👦🏾👩🏿‍👨🏿‍👦🏿‍👦🏿",
> > >  i);
> > >                                                
> > > System.out.println(abbreviateResult);
> > >                                                
> > > //assertEquals(expectedResultsFamilyWithCodepoints[i - 4], 
> > > abbreviateResult);
> > >                                }
> > >                 }
> > > }
> > >
> > >
>

Reply via email to