garydgregory commented on code in PR #781: URL: https://github.com/apache/commons-io/pull/781#discussion_r2342461884
########## src/main/java/org/apache/commons/io/FileSystem.java: ########## @@ -504,30 +517,201 @@ public boolean supportsDriveLetter() { } /** - * Converts a candidate file name (without a path) like {@code "filename.ext"} or {@code "filename"} to a legal file - * name. Illegal characters in the candidate name are replaced by the {@code replacement} character. If the file - * name length exceeds {@link #getMaxFileNameLength()}, then the name is truncated to - * {@link #getMaxFileNameLength()}. + * Converts a candidate file name (without a path) to a legal file name. + * + * <p>Takes a file name like {@code "filename.ext"} or {@code "filename"} and:</p> + * <ul> + * <li>replaces illegal characters by the given replacement character</li> + * <li>truncates the name to {@link #getMaxFileNameLength()} if necessary</li> + * </ul> * * @param candidate - * a candidate file name (without a path) like {@code "filename.ext"} or {@code "filename"} + * A candidate file name (without a path) like {@code "filename.ext"} or {@code "filename"} * @param replacement * Illegal characters in the candidate name are replaced by this character * @return a String without illegal characters */ public String toLegalFileName(final String candidate, final char replacement) { + return toLegalFileName(candidate, replacement, Charset.defaultCharset()); + } + + /** + * Converts a candidate file name (without a path) to a legal file name. 
+ * + * <p>Takes a file name like {@code "filename.ext"} or {@code "filename"} and:</p> + * <ul> + * <li>replaces illegal characters by the given replacement character</li> + * <li>truncates the name to {@link #getMaxFileNameLength()} if necessary</li> + * </ul> + * + * @param candidate + * A candidate file name (without a path) like {@code "filename.ext"} or {@code "filename"} + * @param replacement + * Illegal characters in the candidate name are replaced by this character + * @param charset + * The charset to use when the file name length is measured in bytes + * @return a String without illegal characters + * @since 2.21.0 + */ + public String toLegalFileName(final String candidate, final char replacement, final Charset charset) { + Objects.requireNonNull(candidate, "candidate"); + if (candidate.isEmpty()) { + throw new IllegalArgumentException("The candidate file name is empty"); + } if (isIllegalFileNameChar(replacement)) { // %s does not work properly with NUL throw new IllegalArgumentException(String.format("The replacement character '%s' cannot be one of the %s illegal characters: %s", replacement == '\0' ? "\\0" : replacement, name(), Arrays.toString(illegalFileNameChars))); } - final String truncated = candidate.length() > maxFileNameLength ? candidate.substring(0, maxFileNameLength) : candidate; + final CharSequence truncated = nameLengthStrategy.truncate(candidate, getMaxFileNameLength(), charset); final int[] array = truncated.chars().map(i -> isIllegalFileNameChar(i) ? replacement : i).toArray(); return new String(array, 0, array.length); } CharSequence trimExtension(final CharSequence cs) { - final int index = indexOf(cs, '.', 0); - return index < 0 ? cs : cs.subSequence(0, index); + final int index = indexOfFirstDot(cs); + // An initial dot is not an extension + return index < 1 ? cs : cs.subSequence(0, index); } + + /** + * Strategy for measuring and truncating file or path names in different units. 
+ * Implementations measure length and can truncate to a specified limit. + */ + enum NameLengthStrategy { + /** Length measured as encoded bytes. */ + BYTES { + @Override + int getLength(final CharSequence value, final Charset charset) { + final CharsetEncoder enc = charset.newEncoder() + .onMalformedInput(CodingErrorAction.REPORT) + .onUnmappableCharacter(CodingErrorAction.REPORT); + try { + return enc.encode(CharBuffer.wrap(value)).remaining(); + } catch (CharacterCodingException e) { + // Unencodable, does not fit any byte limit. + return Integer.MAX_VALUE; + } + } + + @Override + CharSequence truncate(final CharSequence value, final int limit, final Charset charset) { Review Comment: I'm not sure this works for more complex examples in https://issues.apache.org/jira/projects/LANG/issues/LANG-1770 for strings like "👩🏻‍👨🏻‍👦🏻‍👦🏻👩🏼‍👨🏼‍👦🏼‍👦🏼👩🏽‍👨🏽‍👦🏽‍👦🏽👩🏾‍👨🏾‍👦🏾‍👦🏾👩🏿‍👨🏿‍👦🏿‍👦🏿👩🏻‍👨🏻‍👦🏻‍👦🏻👩🏼‍👨🏼‍👦🏼‍👦🏼👩🏽‍👨🏽‍👦🏽‍👦🏽👩🏾‍👨🏾‍👦🏾‍👦🏾👩🏿‍👨🏿‍👦🏿‍👦🏿" -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@commons.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org