Hi,
not exactly on topic, but here is a C++ function that checks whether a file 
name is "bad".
It returns a string naming the reason, or nullptr if the name is fine:



        // Classifies one code point decoded from a multi-byte UTF-8 sequence.
        //
        // Returns nullptr if the code point is acceptable in a file name,
        // otherwise a string literal naming the category it falls into.
        // The categories are tested in a fixed order, so a value matching
        // several of them (e.g. U+FFFFE) reports the first one listed here.
        static const char* unwantedCodePoint(uint32_t code) {
            if (0x0080 <= code && code <= 0x009F) {
                return "C1ControlCode";
            }
            // LRM/RLM and the embedding/override controls LRE..RLO can make
            // a name display differently from its stored byte order.
            if ((0x200E <= code && code <= 0x200F) ||
                (0x202A <= code && code <= 0x202E)) {
                return "directionMarks";
            }
            // The last two code points of every plane, plus U+FDD0..U+FDEF.
            if ((code & 0xFFFF) >= 0xFFFE ||
                (0xFDD0 <= code && code <= 0xFDEF)) {
                return "NonCharacter";
            }
            if (0xE000 <= code && code <= 0xF8FF) {
                return "privateUseUTF";
            }
            if (0xF0000 <= code && code <= 0xFFFFF) {
                return "privateUseAreaA";
            }
            if (0x100000 <= code && code <= 0x10FFFD) {
                return "privateUseAreaB";
            }
            if (0x110000 <= code) {
                return "beyondLastPlane";
            }
            // UTF-16 surrogates must never appear in UTF-8.
            if (0xD800 <= code && code <= 0xDFFF) {
                return "surrogateCodePoint";
            }
            return nullptr;
        }

        // Checks whether `name` is acceptable as a file name (a single path
        // component that is expected to be UTF-8 encoded).
        //
        // Returns nullptr if the name is fine. Otherwise returns a string
        // literal naming the first problem found, one of:
        //   emptyName, spaceInFront, spaceInBack, tooLong,
        //   nonPrintableAScii, slashInName, nonUTFStarter, missingExtByte,
        //   overlongEncoding, or any reason from unwantedCodePoint().
        //
        // The function reads nothing but its argument, so it is static: that
        // compiles both inside a class and at namespace scope, and it can
        // still be called on const objects.
        static const char* hasGoodName(const std::string& name) {
            // front()/back() below are undefined on an empty string.
            if (name.empty()) {
                return "emptyName";
            }
            // The <cctype> classifiers require a value representable as
            // unsigned char; a plain (possibly negative) char is undefined.
            if (isspace(static_cast<unsigned char>(name.front()))) {
                return "spaceInFront";
            }
            if (isspace(static_cast<unsigned char>(name.back()))) {
                return "spaceInBack";
            }
            // Only relevant if files go to dCache: leave room for the
            // dot-command wrapper. sizeof counts the terminating NUL, so
            // names of up to 255 - 20 = 235 bytes are accepted.
            if (name.size() > (255 - sizeof(".(get)()(checksums)"))) {
                return "tooLong";
            }

            size_t pending = 0;    // extension bytes still expected
            uint32_t code = 0;     // code point assembled so far
            uint32_t minCode = 0;  // smallest value this length may encode
            for (const char raw : name) {
                const unsigned char c = static_cast<unsigned char>(raw);

                if (pending == 0) {  // start of a new character
                    if ((c & 0b1000'0000) == 0) {  // plain ASCII
                        if (!isprint(c)) {
                            return "nonPrintableAScii";
                        }
                        if (c == '/') {  // path separator, never allowed
                            return "slashInName";
                        }
                    } else if ((c & 0b1110'0000) == 0b1100'0000) {
                        code = c & 0b0001'1111;  // one extension byte
                        pending = 1;
                        minCode = 0x80;
                    } else if ((c & 0b1111'0000) == 0b1110'0000) {
                        code = c & 0b0000'1111;  // two extension bytes
                        pending = 2;
                        minCode = 0x800;
                    } else if ((c & 0b1111'1000) == 0b1111'0000) {
                        code = c & 0b0000'0111;  // three extension bytes
                        pending = 3;
                        minCode = 0x10000;
                    } else {  // stray extension byte or 0xF8..0xFF
                        return "nonUTFStarter";
                    }
                    continue;
                }

                // Inside a multi-byte sequence: must be 10xxxxxx.
                if ((c & 0b1100'0000) != 0b1000'0000) {
                    return "missingExtByte";
                }
                code = (code << 6) | (c & 0b0011'1111);
                if (--pending != 0) {
                    continue;
                }

                // Last extension byte seen: the code point is complete.
                if (const char* reason = unwantedCodePoint(code)) {
                    return reason;
                }
                // Tested after the category checks so that sequences which
                // already had a reason keep it. Overlong forms could
                // otherwise hide '/' or control characters from the ASCII
                // checks above (e.g. C0 AF decodes to '/').
                if (code < minCode) {
                    return "overlongEncoding";
                }
            }

            // A name that ends in the middle of a multi-byte sequence is
            // ill-formed as well.
            return pending == 0 ? nullptr : "missingExtByte";
        }


----- Original Message -----
> From: "Jonathan Buzzard" <[email protected]>
> To: "gpfsug-discuss" <[email protected]>
> Sent: Friday, 7 July, 2023 14:37:26
> Subject: Re: [gpfsug-discuss] Special characters in filenames

> [Invalid UTF-8]
> 
> _______________________________________________
> gpfsug-discuss mailing list
> gpfsug-discuss at gpfsug.org
> http://gpfsug.org/mailman/listinfo/gpfsug-discuss_gpfsug.org

_______________________________________________
gpfsug-discuss mailing list
gpfsug-discuss at gpfsug.org
http://gpfsug.org/mailman/listinfo/gpfsug-discuss_gpfsug.org

Reply via email to