[gpfsug-discuss] Special characters in filenames

Hannappel, Juergen juergen.hannappel at desy.de
Fri Jul 7 17:22:27 BST 2023


Hi,
not exactly on the subject, but here is a C++ function that checks whether a file name is "bad".
It returns a string naming the reason, or nullptr if the name is fine:



	/**
	 * Checks whether a file name is acceptable, i.e. well-formed UTF-8 without
	 * characters that are known to cause trouble.
	 *
	 * A name is rejected if it
	 *  - is empty, or starts or ends with a whitespace character,
	 *  - is longer than 255 bytes minus the room needed for the
	 *    ".(get)()(checksums)" dot command (only relevant if files go to dCache),
	 *  - contains a non-printable ASCII character or a '/',
	 *  - is not well-formed UTF-8 (stray or missing extension bytes, overlong
	 *    encodings, surrogates, code points beyond U+10FFFF),
	 *  - contains C1 control codes, direction marks/overrides, Unicode
	 *    noncharacters or private-use code points.
	 *
	 * The function reads nothing but its argument, so it is declared static.
	 * isspace()/isprint() consult the current C locale.
	 *
	 * @param name  the file name (one path component) as a byte string
	 * @return nullptr if the name is fine, otherwise a pointer to a string
	 *         literal naming the first problem found
	 */
	static const char* hasGoodName(const std::string& name) {
		if (name.empty()) { // front()/back() must not be called on an empty string
			return "emptyName";
		}
		// The <cctype> classifiers need a value representable as unsigned char;
		// handing them a negative char is undefined behaviour.
		if (isspace(static_cast<unsigned char>(name.front()))) {
			return "spaceInFront";
		}
		if (isspace(static_cast<unsigned char>(name.back()))) {
			return "spaceInBack";
		}
		// only relevant if files go to dCache...
		// (sizeof also counts the terminating NUL of the literal)
		constexpr size_t maxLength = 255 - sizeof(".(get)()(checksums)");
		if (name.size() > maxLength) { // too long for dot command magic
			return "tooLong";
		}

		// Judges one completely decoded multi-byte code point. `smallest` is the
		// lowest value that legitimately needs the sequence length that was used.
		const auto checkCodePoint = [](uint32_t code, uint32_t smallest) -> const char* {
			if (code < smallest) { // e.g. 0xC0 0xAF smuggling in a '/'
				return "overlongEncoding";
			}
			if (0x0080 <= code && code <= 0x009F) {
				return "C1ControlCode";
			}
			if ((0x200E <= code && code <= 0x200F) ||
			    (0x202A <= code && code <= 0x202E)) {
				return "directionMarks";
			}
			if (0xD800 <= code && code <= 0xDFFF) { // UTF-16 surrogates are not valid in UTF-8
				return "surrogateCodePoint";
			}
			if ((code & 0xFFFF) >= 0xFFFE || // last two code points of every plane
			    (0xFDD0 <= code && code <= 0xFDEF)) {
				return "NonCharacter";
			}
			if (0xE000 <= code && code <= 0xF8FF) {
				return "privateUseUTF";
			}
			if (0xF0000 <= code && code <= 0xFFFFF) {
				return "privateUseAreaA";
			}
			if (0x100000 <= code && code <= 0x10FFFD) {
				return "privateUseAreaB";
			}
			if (0x110000 <= code) {
				return "beyondLastPlane";
			}
			return nullptr;
		};

		size_t pending = 0;    // number of extension bytes still expected
		uint32_t code = 0;     // code point assembled so far
		uint32_t smallest = 0; // minimum code point for the current sequence length
		for (const char raw : name) {
			const unsigned char c = static_cast<unsigned char>(raw);
			if (pending == 0) { // not an extension byte
				if ((c & 0b1000'0000) == 0) { // ASCII
					if (!isprint(c)) {
						return "nonPrintableAScii";
					}
					if (c == '/') { // this must not be!
						return "slashInName";
					}
				} else if ((c & 0b1110'0000) == 0b1100'0000) { // one ext byte
					code = c & 0b0001'1111;
					smallest = 0x80;
					pending = 1;
				} else if ((c & 0b1111'0000) == 0b1110'0000) { // two ext bytes
					code = c & 0b0000'1111;
					smallest = 0x800;
					pending = 2;
				} else if ((c & 0b1111'1000) == 0b1111'0000) { // three ext bytes
					code = c & 0b0000'0111;
					smallest = 0x10000;
					pending = 3;
				} else { // no UTF8 coding
					return "nonUTFStarter";
				}
			} else if ((c & 0b1100'0000) == 0b1000'0000) { // well-formed extension byte
				code = (code << 6) | (c & 0b0011'1111);
				pending--;
				if (pending == 0) { // last extension byte seen: code finished
					if (const char* reason = checkCodePoint(code, smallest)) {
						return reason;
					}
				}
			} else { // ill-formed
				return "missingExtByte";
			}
		}
		if (pending != 0) { // name ends in the middle of a multi-byte sequence
			return "missingExtByte";
		}
		return nullptr;
	}


----- Original Message -----
> From: "Jonathan Buzzard" <jonathan.buzzard at strath.ac.uk>
> To: "gpfsug-discuss" <gpfsug-discuss at gpfsug.org>
> Sent: Friday, 7 July, 2023 14:37:26
> Subject: Re: [gpfsug-discuss] Special characters in filenames

> [Invalid UTF-8]
> 
> _______________________________________________
> gpfsug-discuss mailing list
> gpfsug-discuss at gpfsug.org
> http://gpfsug.org/mailman/listinfo/gpfsug-discuss_gpfsug.org



More information about the gpfsug-discuss mailing list