Hi, this is not exactly on topic, but here is a C++ function that checks whether a file name is "bad". It returns a string naming the reason, or nullptr if the name is fine:
// Checks whether `name` is acceptable as a single file name (one path
// component, expected to be UTF-8 encoded).
//
// Returns nullptr when nothing objectionable was found, otherwise a pointer
// to a string literal (static storage, never to be freed) naming the first
// problem encountered:
//   "emptyName"          the name has no bytes at all
//   "spaceInFront"       first byte is whitespace
//   "spaceInBack"        last byte is whitespace
//   "tooLong"            more than 235 bytes
//   "nonPrintableAScii"  ASCII byte that is not printable
//   "slashInName"        contains '/'
//   "nonUTFStarter"      byte that cannot start a UTF-8 sequence
//   "missingExtByte"     continuation byte missing, malformed, or cut off
//   "C1ControlCode"      U+0080..U+009F
//   "directionMarks"     U+200E..U+200F, U+202A..U+202E
//   "NonCharacter"       U+xxFFFE/U+xxFFFF in any plane, U+FDD0..U+FDEF
//   "privateUseUTF"      U+E000..U+F8FF
//   "privateUseAreaA"    U+F0000..U+FFFFF
//   "privateUseAreaB"    U+100000..U+10FFFD
//   "beyondLastPlane"    above U+10FFFF
//   "surrogate"          U+D800..U+DFFF (not valid in UTF-8)
//   "overlongEncoding"   code point encoded with more bytes than necessary
//
// The function reads nothing but its argument, so it is declared static:
// that is valid both at namespace scope and inside a class, where it remains
// callable through const and non-const objects alike.
static const char* hasGoodName(const std::string& name) {
    // front()/back() are undefined on an empty string, so reject it first.
    if (name.empty()) {
        return "emptyName";
    }

    // The <ctype.h> classifiers are undefined for negative arguments, so
    // every byte is converted to unsigned char before being classified.
    const auto isSpaceByte = [](char ch) {
        return isspace(static_cast<unsigned char>(ch)) != 0;
    };
    if (isSpaceByte(name.front())) {
        return "spaceInFront";
    }
    if (isSpaceByte(name.back())) {
        return "spaceInBack";
    }

    // Only relevant if files go to dCache: the name must still fit once it is
    // wrapped by the dot-command magic. sizeof also counts the literal's
    // terminating NUL, so the limit works out to 255 - 20 = 235 bytes.
    if (name.size() > (255 - sizeof(".(get)()(checksums)"))) {
        return "tooLong";
    }

    // Names the problem with a fully decoded multi-byte code point, or
    // returns nullptr when the code point is acceptable.
    const auto reasonForCodePoint = [](uint32_t cp) -> const char* {
        const auto within = [cp](uint32_t lo, uint32_t hi) {
            return lo <= cp && cp <= hi;
        };
        if (within(0x0080, 0x009F)) {
            return "C1ControlCode";
        }
        if (within(0x200E, 0x200F) || within(0x202A, 0x202E)) {
            return "directionMarks";
        }
        // The last two code points of every plane are non-characters.
        if ((cp & 0xFFFF) >= 0xFFFE || within(0xFDD0, 0xFDEF)) {
            return "NonCharacter";
        }
        if (within(0xE000, 0xF8FF)) {
            return "privateUseUTF";
        }
        if (within(0xF0000, 0xFFFFF)) {
            return "privateUseAreaA";
        }
        if (within(0x100000, 0x10FFFD)) {
            return "privateUseAreaB";
        }
        if (cp >= 0x110000) {
            return "beyondLastPlane";
        }
        if (within(0xD800, 0xDFFF)) {
            return "surrogate";
        }
        return nullptr;
    };

    unsigned pending = 0;   // continuation bytes still expected
    uint32_t code = 0;      // code point being assembled
    uint32_t minCode = 0;   // smallest code point that needs this sequence length
    for (const char ch : name) {
        const unsigned char byte = static_cast<unsigned char>(ch);

        if (pending == 0) {  // start of a new character
            if (byte < 0x80) {  // plain ASCII
                if (!isprint(byte)) {
                    return "nonPrintableAScii";
                }
                if (byte == '/') {  // would be read as a path separator
                    return "slashInName";
                }
            } else if ((byte & 0xE0) == 0xC0) {  // one continuation byte follows
                code = byte & 0x1F;
                pending = 1;
                minCode = 0x80;
            } else if ((byte & 0xF0) == 0xE0) {  // two continuation bytes follow
                code = byte & 0x0F;
                pending = 2;
                minCode = 0x800;
            } else if ((byte & 0xF8) == 0xF0) {  // three continuation bytes follow
                code = byte & 0x07;
                pending = 3;
                minCode = 0x10000;
            } else {  // stray continuation byte or 0xF8..0xFF
                return "nonUTFStarter";
            }
            continue;
        }

        // Inside a multi-byte sequence: the byte must look like 10xxxxxx.
        if ((byte & 0xC0) != 0x80) {
            return "missingExtByte";
        }
        code = (code << 6) | (byte & 0x3F);
        if (--pending != 0) {
            continue;
        }

        // Code point complete. The range checks run before the overlong check
        // so that the more specific reason wins when both apply.
        if (const char* reason = reasonForCodePoint(code)) {
            return reason;
        }
        if (code < minCode) {
            // e.g. C0 AF, a two-byte spelling of '/', which would otherwise
            // slip past the ASCII checks above.
            return "overlongEncoding";
        }
    }

    // The name ended in the middle of a multi-byte sequence.
    if (pending != 0) {
        return "missingExtByte";
    }
    return nullptr;
}
----- Original Message -----
> From: "Jonathan Buzzard" <[email protected]>
> To: "gpfsug-discuss" <[email protected]>
> Sent: Friday, 7 July, 2023 14:37:26
> Subject: Re: [gpfsug-discuss] Special characters in filenames
> [Invalid UTF-8]
>
> _______________________________________________
> gpfsug-discuss mailing list
> gpfsug-discuss at gpfsug.org
> http://gpfsug.org/mailman/listinfo/gpfsug-discuss_gpfsug.org
_______________________________________________
gpfsug-discuss mailing list
gpfsug-discuss at gpfsug.org
http://gpfsug.org/mailman/listinfo/gpfsug-discuss_gpfsug.org
