Greg,

It looks like you've been dealing with PDF strings containing garbage
characters.  My guess is that you're seeing the private use characters that
PDFKit inserts, which are certainly going to confuse AGRegex.  I use the
following category on NSMutableString for removing them, and you might want
to do similar cleanup (I use this in BD for cleaning Skim notes).

hth,
Adam

// NS and CF character sets won't find these, due to the way CFString
// handles surrogate pairs.  The surrogate pair inlines were borrowed from
// CFCharacterSetPriv.h in CF-lite-476.13.
/* Returns true when `character` is a UTF-16 high (leading) surrogate,
   i.e. it lies in the inclusive range 0xD800...0xDBFF. */
static inline bool __SKIsSurrogateHighCharacter(const UniChar character) {
    if (character < 0xD800UL)
        return false;
    return (character <= 0xDBFFUL);
}

/* Returns true when `character` is a UTF-16 low (trailing) surrogate,
   i.e. it lies in the inclusive range 0xDC00...0xDFFF. */
static inline bool __SKIsSurrogateLowCharacter(const UniChar character) {
    if (character < 0xDC00UL)
        return false;
    return (character <= 0xDFFFUL);
}

/* Combines a UTF-16 surrogate pair into a single UTF-32 code point.
   No validation is performed here: callers are expected to check that
   `surrogateHigh` and `surrogateLow` really are high/low surrogates before
   trusting the result. */
static inline UTF32Char __SKGetLongCharacterForSurrogatePair(const UniChar surrogateHigh, const UniChar surrogateLow) {
    /* the high surrogate carries the upper 10 bits of the offset... */
    const unsigned long upperBits = (surrogateHigh - 0xD800UL) << 10;
    /* ...and the low surrogate carries the lower 10 bits */
    const unsigned long lowerBits = (surrogateLow - 0xDC00UL);
    /* the offset is relative to the start of the supplementary planes */
    return (UTF32Char)(upperBits + lowerBits + 0x0010000UL);
}

/* Returns true when the UTF-32 code point `ch` falls in one of the three
   private use ranges tested below. */
static inline bool __SKIsPrivateUseCharacter(const UTF32Char ch)
{
    /* private use area */
    if (ch >= 0xE000UL && ch <= 0xF8FFUL)
        return true;

    /* supplementary private use A */
    if (ch >= 0xF0000UL && ch <= 0xFFFFFUL)
        return true;

    /* supplementary private use B */
    return (ch >= 0x100000UL && ch <= 0x10FFFFUL);
}

// Inclusive bounds of the whole UTF-16 surrogate range (high and low
// surrogates together); used to spot any surrogate code unit.
#define SURROGATE_START 0xD800
#define SURROGATE_END 0xDFFF

// Remove anything in the private use planes, and/or malformed surrogate
// pair sequences rdar://problem/6273932
//
// The string is edited in place.  The characters are scanned through an
// inline buffer over the original contents; the first time something has to
// be removed a mutable copy is made, all deletions are applied to that copy,
// and the copy's contents are written back to `string` at the end.  A string
// that contains nothing to remove is therefore never copied or modified.
//
// Passing nil is a no-op.
static void removeAliens(NSMutableString *string)
{
    // the CFString functions used below crash on NULL
    if (string == nil)
        return;

    CFMutableStringRef original = (CFMutableStringRef)string;

    // stays NULL until the first deletion is required
    CFMutableStringRef cleaned = NULL;

    const CFIndex length = CFStringGetLength(original);
    CFStringInlineBuffer inlineBuffer;
    CFStringInitInlineBuffer(original, &inlineBuffer, CFRangeMake(0, length));

    // idx is the current index into the inline buffer (the original
    // contents), and delIdx is the corresponding index in the mutable copy
    CFIndex idx = 0, delIdx = 0;
    while (idx < length) {
        const UniChar ch = CFStringGetCharacterFromInlineBuffer(&inlineBuffer, idx);

        // number of UTF-16 units this iteration consumes, and whether they
        // are to be dropped from the result
        CFIndex unitCount = 1;
        bool shouldRemove = false;

        if (__SKIsPrivateUseCharacter(ch)) {
            shouldRemove = true;
        } else if ((ch >= SURROGATE_START) && (ch <= SURROGATE_END)) {
            const bool hasNext = ((idx + 1) < length);
            const UniChar nextCh = hasNext ? CFStringGetCharacterFromInlineBuffer(&inlineBuffer, idx + 1) : 0;

            if (hasNext && __SKIsSurrogateHighCharacter(ch) && __SKIsSurrogateLowCharacter(nextCh)) {
                // well-formed pair: keep it unless it encodes a private use
                // character, which can't be displayed
                unitCount = 2;
                shouldRemove = __SKIsPrivateUseCharacter(__SKGetLongCharacterForSurrogatePair(ch, nextCh));
            } else {
                // only half of a surrogate pair (stray low surrogate, high
                // surrogate without a low one, or surrogate at the very end):
                // delete just the offending unit; the following unit is
                // examined on its own in the next iteration
                shouldRemove = true;
            }
        }

        if (shouldRemove) {
            if (cleaned == NULL) {
                cleaned = CFStringCreateMutableCopy(CFGetAllocator(original), 0, original);
                // allocation failed; leave the string untouched
                if (cleaned == NULL)
                    return;
            }
            // delIdx does not advance: what followed the deleted range has
            // shifted down to delIdx
            CFStringDelete(cleaned, CFRangeMake(delIdx, unitCount));
        } else {
            delIdx += unitCount;
        }
        idx += unitCount;
    }

    // a non-NULL copy means at least one deletion happened
    if (cleaned != NULL) {
        [string setString:(NSString *)cleaned];
        CFRelease(cleaned);
    }
}

// Cleans the receiver in place by handing it to the removeAliens() function.
- (void)removeAliens
{
    removeAliens(self);
}


------------------------------------------------------------------------------
This SF.net email is sponsored by:
SourceForge Community
SourceForge wants to tell your story.
http://p.sf.net/sfu/sf-spreadtheword
_______________________________________________
Bibdesk-develop mailing list
Bibdesk-develop@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/bibdesk-develop

Reply via email to