Allow the FileDataStore to scale over several millions of files
---------------------------------------------------------------
Key: JCR-2682
URL: https://issues.apache.org/jira/browse/JCR-2682
Project: Jackrabbit Content Repository
Issue Type: Improvement
Components: jackrabbit-core
Affects Versions: 2.1.0, 1.6.2
Environment: Linux (Red-Hat)
Reporter: Vincent Larchet
in a project where we handle several millions of documents stored in JackRabbit
using the FileDataStore we encountered issues related to the file system itself
(ext3) and with our backup tool.
The root cause is that having millions of files in the same file system is
quite hard, and with the way files are stored (using directories built upon the
file content's hash), the backup tool has to scan the whole Table Of Contents to
detect what has changed. In our case it takes approx. 2.5 hours to scan the 5+
millions files.
My idea was to be able to use several file systems mounted in the same
FileDataStore and declare some as read-only (thus the backup tool does not have
to scan them to find new files).
I made a working prototype by enhancing the FileDataStore to have a new level
at the top of the folders hierarchy, this folder changing with document
insertion date, the granularity being configured by a pattern (compatible with
SimpleDateFormat provided in the FileDataStore spring configuration)
Example:
* if we specify
<DataStore class="org.apache.jackrabbit.core.data.FileDataStore">
[...]
<param name="prefixDatePattern" value="yyyy-MM" />
</DataStore>
* then a folder ${FileDataStore.path}/2010-07/ will be created this month, this
folder containing the usual 3 level folder hierarchy built with content's hash
* this allows to mount a dedicated file system on this folder: In our case (we
do not modify existing data), next month (in August), this filesystem will be
re-mounted in read-only and the backup tool will just skip it most of the time
NOTE: the implementation is 100% backward compatible: documents already stored
by the current FileDataStore keep the way they are persisted, and it is possible
to change the config without having to extract/re-import all previous files (of
course, "old" documents will keep their "old" path on the hard-drive)
--------
seems that I can't upload files, so here is the patch for the trunk (only
FileDataStore is impacted) :
{code:title=FileDataStore.java}
--- FileDataStore.2.0-orig.java lun. juil. 19 15:50:13 2010
+++ FileDataStore.java lun. juil. 19 15:52:55 2010
@@ -26,8 +26,10 @@
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.sql.Timestamp;
+import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Collections;
+import java.util.Date;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
@@ -85,6 +87,11 @@
* Must be at least 3 characters.
*/
private static final String TMP = "tmp";
+
+ /**
+ * Separator used for differencing the date part from the hash in the
identifier.
+ */
+ private static final String DATE_SEP = "#";
/**
* The minimum modified date. If a file is accessed (read or write) with a
modified date
@@ -105,6 +112,12 @@
private String path;
/**
+ * The date pattern to use as a prefix for directories in the repository.
Set it to
+ * null or an empty string to disable this feature.
+ */
+ private String prefixDatePattern;
+
+ /**
* The minimum size of an object that should be stored in this data store.
*/
private int minRecordLength = DEFAULT_MIN_RECORD_LENGTH;
@@ -116,6 +129,13 @@
Collections.synchronizedMap(new WeakHashMap<DataIdentifier,
WeakReference<DataIdentifier>>());
/**
+ * Creates a uninitialized data store.
+ *
+ */
+ public FileDataStore() {
+ }
+
+ /**
* Initialized the data store.
* If the path is not set, <repository home>/repository/datastore is
used.
* This directory is automatically created if it does not yet exist.
@@ -199,7 +219,22 @@
} finally {
output.close();
}
- DataIdentifier identifier = new DataIdentifier(digest.digest());
+
+ // Convert the digest to an hexadecimal string...
+ String id = new DataIdentifier(digest.digest()).toString();
+
+ // ... and prepend it with the current date if prefixDatePattern
is set.
+ String prefixDatePattern = getPrefixDatePattern();
+ if (null != prefixDatePattern && !"".equals(prefixDatePattern)) {
+ try {
+ SimpleDateFormat sdf = new
SimpleDateFormat(prefixDatePattern);
+ String prefixDate = sdf.format(new Date());
+ id = prefixDate + DATE_SEP + id;
+ } catch (IllegalArgumentException e) {
+ log.warn("Date pattern ["+prefixDatePattern+"] is
incorrect. Ignoring the prefixDatePattern for FileDataStore.");
+ }
+ }
+ DataIdentifier identifier = new DataIdentifier(id);
File file;
synchronized (this) {
@@ -267,9 +302,16 @@
usesIdentifier(identifier);
String string = identifier.toString();
File file = directory;
- file = new File(file, string.substring(0, 2));
- file = new File(file, string.substring(2, 4));
- file = new File(file, string.substring(4, 6));
+ int indexDate = string.indexOf(DATE_SEP);
+ if (indexDate > -1) {
+ file = new File(file, string.substring(0, indexDate));
+ indexDate++; // To ignore the date separator
+ } else {
+ indexDate = 0;
+ }
+ file = new File(file, string.substring(indexDate, indexDate+2));
+ file = new File(file, string.substring(indexDate+2, indexDate+4));
+ file = new File(file, string.substring(indexDate+4, indexDate+6));
return new File(file, string);
}
@@ -378,6 +420,28 @@
this.path = directoryName;
}
+ /**
+ * Get the date pattern to use as a prefix for the data store repository.
+ *
+ * @return the date pattern
+ */
+ public String getPrefixDatePattern() {
+ return prefixDatePattern;
+ }
+
+ /**
+ * Set the date pattern to use as a prefix for the data store repository.
+ *
+ * @param prefixDatePattern the date pattern
+ */
+ public void setPrefixDatePattern(String prefixDatePattern) {
+ // We want to prevent the inclusion of the DATE_SEP character
in the date prefix
+ if (prefixDatePattern.indexOf(DATE_SEP) > -1) {
+ log.warn("Do not use the character ["+DATE_SEP+"] in
your date pattern for FileDataStore!");
+ }
+ this.prefixDatePattern = prefixDatePattern.replaceAll(DATE_SEP,
"");
+ }
+
public int getMinRecordLength() {
return minRecordLength;
}
{code}
--
This message is automatically generated by JIRA.
-
You can reply to this email to add a comment to the issue online.