A few months ago I added a new parameter that indicates that portable
paths should be used. We were having problems storing Japanese file
names on Windows and then migrating the store to Linux.
It uses a technique similar to url-encode-path (Base64-encode, then
URL-encode), but it only encodes non-Latin paths and encodes each path
segment to avoid all files being contained in a single directory. It
also limits the file name length for files stored to the filesystem to 128
characters, using an MD5 hash of the filename to ensure uniqueness
when truncating.
I have always meant to contribute this code and it sounds like now may
be the time. I have my changes against the SLIDE_2_1_M1_RELEASE tag.
I attached the patches for the 2 modified files. I'm happy to create
a bug report if that is preferred.
--Andy
On Wed, 24 Nov 2004 09:52:44 +0100, Jacob Lund <[EMAIL PROTECTED]> wrote:
> The url-encode-path was originally added to enable you to save files using
> characters not available in your operating system. I did some testing with
> russian filenames and it only worked when this parameter was set to true.
>
> /jacob
>
Index: AbstractTxFileStoreService.java
===================================================================
RCS file:
/home/cvspublic/jakarta-slide/src/stores/org/apache/slide/store/txfile/AbstractTxFileStoreService.java,v
retrieving revision 1.12
diff -w -u -r1.12 AbstractTxFileStoreService.java
--- AbstractTxFileStoreService.java 10 May 2004 08:23:26 -0000 1.12
+++ AbstractTxFileStoreService.java 24 Nov 2004 16:53:03 -0000
@@ -62,6 +62,7 @@
protected static final String TIMEOUT_PARAMETER = "timeout";
protected static final String URLENCODE_PATH_PARAMETER = "url-encode-path";
protected static final String DEBUG_MODE_PARAMETER = "debug";
+ protected static final String USE_PORTABLE_PATH_PARAMETER =
"use-portable-path";
protected FileResourceManager rm;
protected boolean started = false;
@@ -99,11 +100,19 @@
urlEncodePath = "true".equals(urlEncodePathString);
}
+ boolean usePortablePath = false;
+ String usePortablePathString = (String)
parameters.get(USE_PORTABLE_PATH_PARAMETER);
+ if (usePortablePathString != null) {
+ usePortablePath = "true".equals(usePortablePathString);
+ }
+
+ int pathEncoding = (urlEncodePath) ?
FileResourceManager.ENCODE_PATH_URL : (usePortablePath ?
FileResourceManager.ENCODE_PATH_PORTABLE :
FileResourceManager.ENCODE_PATH_NONE);
+
rm =
new FileResourceManager(
storeDir,
workDir,
- urlEncodePath,
+ pathEncoding,
new StoreLogger(getLogger(),
FileResourceManager.class.getName()),
debug);
Index: FileResourceManager.java
===================================================================
RCS file:
/home/cvspublic/jakarta-slide/src/stores/org/apache/slide/store/txfile/rm/impl/Attic/FileResourceManager.java,v
retrieving revision 1.11
diff -w -u -r1.11 FileResourceManager.java
--- FileResourceManager.java 10 May 2004 08:12:45 -0000 1.11
+++ FileResourceManager.java 24 Nov 2004 16:54:05 -0000
@@ -1,5 +1,5 @@
/*
- * $Header:
/home/cvspublic/jakarta-slide/src/stores/org/apache/slide/store/txfile/rm/impl/Attic/FileResourceManager.java,v
1.11 2004/05/10 08:12:45 ozeigermann Exp $
+ * $Header:
/home/cvspublic/jakarta-slide/src/stores/org/apache/slide/store/txfile/rm/impl/FileResourceManager.java,v
1.11 2004/05/10 08:12:45 ozeigermann Exp $
* $Revision: 1.11 $
* $Date: 2004/05/10 08:12:45 $
*
@@ -186,7 +186,6 @@
protected String workDir;
protected String storeDir;
- protected boolean urlEncodePath = false;
protected boolean cleanUp = true;
protected boolean dirty = false;
protected int operationMode = OPERATION_MODE_STOPPED;
@@ -199,6 +198,25 @@
protected List globalOpenResources;
protected LockManager lockManager;
+ public static final int ENCODE_PATH_NONE = 0;
+ public static final int ENCODE_PATH_URL = 1;
+ public static final int ENCODE_PATH_PORTABLE = 2;
+
+ public static final int MAX_PORTABLE_FILENAME = 128;
+
+ protected int encodeMode = ENCODE_PATH_NONE;
+
+ /**
+ * MD5 message digest provider.
+ */
+ protected static java.security.MessageDigest md5Helper;
+
+
+ /**
+ * The MD5 helper object for this class.
+ */
+ protected static final org.apache.util.MD5Encoder md5Encoder = new
org.apache.util.MD5Encoder();
+
/*
* --- ctor and general getter / setter methods ---
*
@@ -222,18 +240,50 @@
*
* @param storeDir directory where main data should go after commit
* @param workDir directory where transactions store temporary data
+ * @param encodeMode specifies the encoding to use for file paths
+ * written to the filesystem, either ENCODE_PATH_NONE,
+ * ENCODE_PATH_URL, or ENCODE_PATH_PORTABLE. This replaced the
+ * boolean urlEncodePath parameter.
+ * @param logger the logger to be used by this store
+ */
+ public FileResourceManager(String storeDir, String workDir, int
encodeMode, StoreLogger logger) {
+ this(storeDir, workDir, encodeMode, logger, false);
+ }
+
+ /**
+ * Creates a new resource manager operating on the specified directories.
+ *
+ * @param storeDir directory where main data should go after commit
+ * @param workDir directory where transactions store temporary data
* @param urlEncodePath if set to <code>true</code> encodes all paths to
allow for any kind of characters
* @param logger the logger to be used by this store
* @param debug if set to <code>true</code> logs all locking information
to "transaction.log" for debugging inspection
*/
public FileResourceManager(String storeDir, String workDir, boolean
urlEncodePath, StoreLogger logger, boolean debug) {
+ this(storeDir, workDir, (urlEncodePath ? ENCODE_PATH_URL :
ENCODE_PATH_NONE), logger, debug);
+ }
+
+ /**
+ * Creates a new resource manager operating on the specified directories.
+ *
+ * @param storeDir directory where main data should go after commit
+ * @param workDir directory where transactions store temporary data
+ * @param encodeMode specifies the encoding to use for file paths
+ * written to the filesystem, either ENCODE_PATH_NONE,
+ * ENCODE_PATH_URL, or ENCODE_PATH_PORTABLE. This replaced the
+ * boolean urlEncodePath parameter.
+ * @param logger the logger to be used by this store
+ * @param debug if set to <code>true</code> logs all locking information
to "transaction.log" for debugging inspection
+ */
+ public FileResourceManager(String storeDir, String workDir, int
encodeMode, StoreLogger logger, boolean debug) {
this.workDir = workDir;
this.storeDir = storeDir;
- this.urlEncodePath = urlEncodePath;
+ this.encodeMode = encodeMode;
this.logger = logger;
this.debug = debug;
}
+
/**
* Gets the store directory.
*
@@ -923,11 +973,128 @@
return (TransactionContext) globalTransactions.get(txId);
}
+ // ----------------------------------------------------------------------
+ // helper functions for portable encoding
+
+ private static final boolean isPortableChar(char c) {
+ return (Character.UnicodeBlock.of(c) ==
Character.UnicodeBlock.BASIC_LATIN);
+ }
+
+ private static final boolean isPortableString(String str) {
+ for (int i=0; i<str.length(); i++) {
+ if (!isPortableChar(str.charAt(i))) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ private static final String makeStringPortable(String str) {
+ if (!isPortableString(str)) {
+ try {
+ // encode utf-8 chars
+ str = new String(Base64.encode(str.getBytes("UTF-8")), "ASCII");
+ // make sure / is encoded
+ str = URLEncoder.encode(str);
+ } catch (UnsupportedEncodingException e) {}
+ }
+
+ return str;
+ }
+
+ private static final String makeFilePortable(String file) {
+ String ret = file;
+
+ if (!isPortableString(file)) {
+ // we break on . and encode each part separately
+ ret = "";
+
+ int s = 0;
+ int e = s;
+ while ((e = file.indexOf('.', s)) != -1) {
+ ret += makeStringPortable( file.substring(s,e) ) + '.';
+ s = e+1;
+ }
+ ret += makeStringPortable( file.substring(s) );
+ }
+
+ // We ran into some issues with Windows and file lengths, so
+ // we cap the length at 128. Note that this does not affect
+ // the actual length of the filename as used from the webdav
+ // interface, because that will be stored in the metadata.
+ if (ret.length() >= MAX_PORTABLE_FILENAME) {
+ if (md5Helper == null) {
+ // initialize the MD5 MessageDigest
+ try {
+ md5Helper = java.security.MessageDigest.getInstance("MD5");
+ } catch (java.security.NoSuchAlgorithmException e) {}
+ }
+ if (md5Helper != null) {
+ try {
+ // truncate and make unique with MD5. note that because the
+ // check uses >=, the only names that are exactly
+ // MAX_PORTABLE_FILENAME characters long are truncated ones.
+ String md5 =
md5Encoder.encode(md5Helper.digest(ret.getBytes("ASCII")));
+ ret = ret.substring(0,MAX_PORTABLE_FILENAME-33) + "_" + md5;
+ } catch (UnsupportedEncodingException e) {}
+ }
+ }
+
+ return ret;
+ }
+
+ private static final String makePathPortable(String path) {
+ // we break on / and encode each part separately
+ String ret = "";
+
+ int s = 0;
+ int e = s;
+ while ((e = path.indexOf('/', s)) != -1) {
+ ret += makeFilePortable( path.substring(s,e) ) + '/';
+ s = e+1;
+ }
+ ret += makeFilePortable( path.substring(s) );
+
+ return ret;
+ }
+
+ // ----------------------------------------------------------------------
+
protected String assureLeadingSlash(Object pathObject) {
String path = "";
if (pathObject != null) {
path = pathObject.toString();
- if (urlEncodePath) {
+ switch (encodeMode) {
+ default:
+ case ENCODE_PATH_NONE:
+ break;
+ case ENCODE_PATH_PORTABLE:
+
+ // we added this mode because URL encoding seemed too
+ // aggressive, particularly for a large content store
+ // with many, deeply nested paths. that storage
+ // technique would result in many, many files in a
+ // single directory which would likely compromise
+ // performance and may lead to file limits in some
+ // filesystems.
+ //
+ // portable path encoding aims to be a portable
+ // solution (can be moved from Windows -> Linux ->
+ // MacOSX -> Windows Japanese -> etc. without
+ // modification) and as such, does not write any
+ // non-ascii characters for any path. it may be
+ // sufficient to avoid only non-latin characters, but
+ // to be safe, we encode any path containing non-ascii
+ // characters.
+ //
+ // note that this encoding technique breaks apart the
+ // path on the '/' separator and then
+ // recombines the encoded result to maintain the same
+ // directory structure.
+
+ path = makePathPortable(path);
+ break;
+ case ENCODE_PATH_URL:
try {
// XXX not allowed as for JDK1.4
// path = URLEncoder.encode(path,
"UTF-8");
@@ -942,6 +1109,7 @@
path = URLEncoder.encode(path);
} catch (UnsupportedEncodingException e) {
}
+ break;
}
if (path.length() > 0 && path.charAt(0) != '/' && path.charAt(0)
!= '\\') {
path = "/" + path;
---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]