HDFS-6864. Archival Storage: add user documentation. Contributed by Tsz Wo 
Nicholas Sze.


Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo
Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/b014e83b
Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/b014e83b
Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/b014e83b

Branch: refs/heads/HDFS-6581
Commit: b014e83bc5899ec135b1e7a54ca1902c970047a5
Parents: 91f6dde
Author: Jing Zhao <j...@hortonworks.com>
Authored: Wed Sep 17 09:40:17 2014 -0700
Committer: Jing Zhao <j...@hortonworks.com>
Committed: Wed Sep 17 09:40:17 2014 -0700

----------------------------------------------------------------------
 hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt     |   2 +
 .../hadoop/hdfs/DistributedFileSystem.java      |   6 +
 .../apache/hadoop/hdfs/server/mover/Mover.java  |   6 +-
 .../src/site/apt/ArchivalStorage.apt.vm         | 302 +++++++++++++++++++
 .../src/site/apt/HDFSCommands.apt.vm            |  43 ++-
 hadoop-project/src/site/site.xml                |   1 +
 6 files changed, 349 insertions(+), 11 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hadoop/blob/b014e83b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
----------------------------------------------------------------------
diff --git a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt 
b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
index e859ca2..7a9c723 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
+++ b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
@@ -71,6 +71,8 @@ HDFS-6584: Archival Storage
 
     HDFS-7072. Fix TestBlockManager and TestStorageMover.  (jing9 via szetszwo)
 
+    HDFS-6864. Archival Storage: add user documentation. (szetszwo via jing9)
+
 Trunk (Unreleased)
 
   INCOMPATIBLE CHANGES

http://git-wip-us.apache.org/repos/asf/hadoop/blob/b014e83b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DistributedFileSystem.java
----------------------------------------------------------------------
diff --git 
a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DistributedFileSystem.java
 
b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DistributedFileSystem.java
index 1c60e7b..6bce8b9 100644
--- 
a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DistributedFileSystem.java
+++ 
b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DistributedFileSystem.java
@@ -472,6 +472,12 @@ public class DistributedFileSystem extends FileSystem {
     }.resolve(this, absF);
   }
 
+  /**
+   * Set the specified storage policy on the source path.
+   *
+   * @param src The source path referring to either a directory or a file.
+   * @param policyName The name of the storage policy.
+   */
   public void setStoragePolicy(final Path src, final String policyName)
       throws IOException {
     statistics.incrementWriteOps(1);

http://git-wip-us.apache.org/repos/asf/hadoop/blob/b014e83b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/mover/Mover.java
----------------------------------------------------------------------
diff --git 
a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/mover/Mover.java
 
b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/mover/Mover.java
index 0812c03..f1837ae 100644
--- 
a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/mover/Mover.java
+++ 
b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/mover/Mover.java
@@ -498,9 +498,9 @@ public class Mover {
 
   static class Cli extends Configured implements Tool {
     private static final String USAGE = "Usage: java "
-        + Mover.class.getSimpleName()
-        + " [-p <space separated files/dirs> specify a list of files/dirs to 
migrate]"
-        + " [-f <local file name>            specify a local file containing 
files/dirs to migrate]";
+        + Mover.class.getSimpleName() + " [-p <files/dirs> | -f <local file>]"
+        + "\n\t-p <files/dirs>\ta space separated list of HDFS files/dirs to 
migrate."
+        + "\n\t-f <local file>\ta local file containing a list of HDFS 
files/dirs to migrate.";
 
     private static Options buildCliOptions() {
       Options opts = new Options();

http://git-wip-us.apache.org/repos/asf/hadoop/blob/b014e83b/hadoop-hdfs-project/hadoop-hdfs/src/site/apt/ArchivalStorage.apt.vm
----------------------------------------------------------------------
diff --git 
a/hadoop-hdfs-project/hadoop-hdfs/src/site/apt/ArchivalStorage.apt.vm 
b/hadoop-hdfs-project/hadoop-hdfs/src/site/apt/ArchivalStorage.apt.vm
new file mode 100644
index 0000000..5301d52
--- /dev/null
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/site/apt/ArchivalStorage.apt.vm
@@ -0,0 +1,302 @@
+~~ Licensed under the Apache License, Version 2.0 (the "License");
+~~ you may not use this file except in compliance with the License.
+~~ You may obtain a copy of the License at
+~~
+~~   http://www.apache.org/licenses/LICENSE-2.0
+~~
+~~ Unless required by applicable law or agreed to in writing, software
+~~ distributed under the License is distributed on an "AS IS" BASIS,
+~~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+~~ See the License for the specific language governing permissions and
+~~ limitations under the License. See accompanying LICENSE file.
+
+  ---
+  HDFS Archival Storage
+  ---
+  ---
+  ${maven.build.timestamp}
+
+HDFS Archival Storage
+
+%{toc|section=1|fromDepth=0}
+
+* {Introduction}
+
+  <Archival Storage> is a solution to decouple growing storage capacity from 
compute capacity.
+  Nodes with higher density and less expensive storage with low compute power 
are becoming available
+  and can be used as cold storage in the clusters.
+  Based on policy, data can be moved from hot storage to cold storage.
+  Adding more nodes to the cold storage can grow the storage independent of 
the compute capacity
+  in the cluster.
+
+* {Storage Types and Storage Policies}
+
+** {Storage Types: DISK, SSD and ARCHIVE}
+
+  The first phase of
+  {{{https://issues.apache.org/jira/browse/HDFS-2832}Heterogeneous Storage 
(HDFS-2832)}}
+  changed the datanode storage model from a single storage,
+  which may correspond to multiple physical storage media,
+  to a collection of storages with each storage corresponding to a physical storage medium.
+  It also added the notion of storage types, DISK and SSD,
+  where DISK is the default storage type.
+
+  A new storage type <ARCHIVE>,
+  which has high storage density (petabytes of storage) but little compute power,
+  is added for supporting archival storage.
+
+** {Storage Policies: Hot, Warm and Cold}
+
+  A new concept of storage policies is introduced in order to allow files to 
be stored
+  in different storage types according to the storage policy.
+
+  We have the following storage policies:
+
+  * <<Hot>> - for both storage and compute.
+              The data that is popular and still being used for processing 
will stay in this policy.
+              When a block is hot, all replicas are stored in DISK.
+
+  * <<Cold>> - only for storage with limited compute.
+               The data that is no longer being used, or data that needs to be 
archived is moved
+               from hot storage to cold storage.
+               When a block is cold, all replicas are stored in ARCHIVE.
+
+  * <<Warm>> - partially hot and partially cold.
+               When a block is warm, some of its replicas are stored in DISK
+               and the remaining replicas are stored in ARCHIVE.
+
+  []
+
+  More formally, a storage policy consists of the following fields:
+
+  [[1]] Policy ID
+
+  [[2]] Policy name
+
+  [[3]] A list of storage types for block placement
+
+  [[4]] A list of fallback storage types for file creation
+
+  [[5]] A list of fallback storage types for replication
+
+  []
+
+  When there is enough space,
+  block replicas are stored according to the storage type list specified in #3.
+  When some of the storage types in list #3 are running out of space,
+  the fallback storage type lists specified in #4 and #5 are used
+  to replace the out-of-space storage types for file creation and replication, 
respectively.
+
+  The following is a typical storage policy table.
+
+*--------+---------------+-------------------------+-----------------------+-----------------------+
+| <<Policy>> | <<Policy>>| <<Block Placement>>     | <<Fallback storages>> | 
<<Fallback storages>> |
+| <<ID>>     | <<Name>>  | <<(n\ replicas)>>      | <<for creation>>      | 
<<for replication>>   |
+*--------+---------------+-------------------------+-----------------------+-----------------------+
+| 12     | Hot (default) | DISK: <n>               | \<none\>              | 
ARCHIVE               |
+*--------+---------------+-------------------------+-----------------------+-----------------------+
+| 8      | Warm          | DISK: 1, ARCHIVE: <n>-1 | DISK, ARCHIVE         | DISK, ARCHIVE         |
+*--------+---------------+-------------------------+-----------------------+-----------------------+
+| 4      | Cold          | ARCHIVE: <n>            | \<none\>              | 
\<none\>              |
+*--------+---------------+-------------------------+-----------------------+-----------------------+
+
+  Note that cluster administrators may change the storage policy table
+  according to the characteristics of the cluster.
+  For example, in order to prevent losing archival data,
+  administrators may want to use DISK as fallback storage for replication in the Cold policy.
+  A drawback of such a setting is that the DISK storages could be filled up with archival data.
+  As a result, the entire cluster may become full and unable to serve hot data anymore.
+
+** {Configurations}
+
+*** {Setting The List of All Storage Policies}
+
+  * <<dfs.block.storage.policies>>
+    - a list of block storage policy names and IDs.
+    The syntax is
+
+      NAME_1:ID_1, NAME_2:ID_2, ..., NAME_<n>:ID_<n>
+
+    where ID is an integer in the closed range [1,15] and NAME is case 
insensitive.
+    The first element is the <default policy>.  Empty list is not allowed.
+
+    The default value is shown below.
+
++------------------------------------------+
+<property>
+  <name>dfs.block.storage.policies</name>
+  <value>HOT:12, WARM:8, COLD:4</value>
+</property>
++------------------------------------------+
+
+  []
+
+*** {Setting Storage Policy Details}
+
+  The following configuration properties are for setting the details of each 
storage policy,
+  where <<<\<ID\>>>> is the actual policy ID.
+
+  * <<dfs.block.storage.policy.\<ID\>>>
+    - a list of storage types for storing the block replicas.
+    The syntax is
+
+      STORAGE_TYPE_1, STORAGE_TYPE_2, ..., STORAGE_TYPE_<n>
+  
+    When creating a block, the <i>-th replica is stored using <i>-th storage 
type
+    for <i> less than or equal to <n>, and
+    the <j>-th replica is stored using <n>-th storage type for <j> greater 
than <n>.
+
+    Empty list is not allowed.
+
+    Examples:
+
++------------------------------------------+
+DISK          : all replicas stored using DISK.
+DISK, ARCHIVE : the first replica is stored using DISK and all the
+                remaining replicas are stored using ARCHIVE.
++------------------------------------------+
+
+  * <<dfs.block.storage.policy.creation-fallback.\<ID\>>>
+    - a list of storage types for creation fallback storage.
+    The syntax is
+
+      STORAGE_TYPE_1, STORAGE_TYPE_2, ..., STORAGE_TYPE_<n>
+  
+    When creating a block, if a particular storage type specified in the policy
+    is unavailable, the fallback STORAGE_TYPE_1 is used.  Further, if
+    STORAGE_TYPE_<i> is also unavailable, the fallback STORAGE_TYPE_<(i+1)> is 
used.
+    In case all fallback storages are unavailable, the block will be created
+    with fewer replicas than the specified replication factor.
+
+    An empty list indicates that there is no fallback storage.
+
+  * <<dfs.block.storage.policy.replication-fallback.\<ID\>>>
+    - a list of storage types for replication fallback storage.
+    The usage of this configuration property is similar to
+    <<<dfs.block.storage.policy.creation-fallback.\<ID\>>>>
+    except that it takes effect on replication but not block creation.
+
+  []
+
+  The following are the default configuration values for Hot, Warm and Cold 
storage policies.
+
+  * Block Storage Policy <<HOT:12>>
+
++------------------------------------------+
+<property>
+  <name>dfs.block.storage.policy.12</name>
+  <value>DISK</value>
+</property>
+<property>
+  <name>dfs.block.storage.policy.creation-fallback.12</name>
+  <value></value>
+</property>
+<property>
+  <name>dfs.block.storage.policy.replication-fallback.12</name>
+  <value>ARCHIVE</value>
+</property>
++------------------------------------------+
+
+  * Block Storage Policy <<WARM:8>>
+
++------------------------------------------+
+<property>
+  <name>dfs.block.storage.policy.8</name>
+  <value>DISK, ARCHIVE</value>
+</property>
+<property>
+  <name>dfs.block.storage.policy.creation-fallback.8</name>
+  <value>DISK, ARCHIVE</value>
+</property>
+<property>
+  <name>dfs.block.storage.policy.replication-fallback.8</name>
+  <value>DISK, ARCHIVE</value>
+</property>
++------------------------------------------+
+
+  * Block Storage Policy <<COLD:4>>
+
++------------------------------------------+
+<property>
+  <name>dfs.block.storage.policy.4</name>
+  <value>ARCHIVE</value>
+</property>
+<property>
+  <name>dfs.block.storage.policy.creation-fallback.4</name>
+  <value></value>
+</property>
+<property>
+  <name>dfs.block.storage.policy.replication-fallback.4</name>
+  <value></value>
+</property>
++------------------------------------------+
+
+  []
+
+* {Mover - A New Data Migration Tool}
+
+  A new data migration tool is added for archiving data.
+  The tool is similar to Balancer.
+  It periodically scans the files in HDFS to check if the block placement 
satisfies the storage policy.
+  For the blocks violating the storage policy,
+  it moves the replicas to a different storage type
+  in order to fulfill the storage policy requirement.
+
+  * Command:
+
++------------------------------------------+
+hdfs mover [-p <files/dirs> | -f <local file>]
++------------------------------------------+
+
+  * Arguments:
+
+*-------------------------+--------------------------------------------------------+
+| <<<-p \<files/dirs\>>>> | Specify a space separated list of HDFS files/dirs 
to migrate.
+*-------------------------+--------------------------------------------------------+
+| <<<-f \<local file\>>>> | Specify a local file containing a list of HDFS 
files/dirs to migrate.
+*-------------------------+--------------------------------------------------------+
+
+  Note that, when both -p and -f options are omitted, the default path is the 
root directory.
+
+  []
+
+
+* {<<<DFSAdmin>>> Commands}
+
+** {Set Storage Policy}
+
+  Set a storage policy to a file or a directory.
+
+  * Command:
+
++------------------------------------------+
+hdfs dfsadmin -setStoragePolicy <path> <policyName>
++------------------------------------------+
+
+  * Arguments:
+
+*----------------------+-----------------------------------------------------+
+| <<<\<path\>>>>       | The path referring to either a directory or a file. |
+*----------------------+-----------------------------------------------------+
+| <<<\<policyName\>>>> | The name of the storage policy.                     |
+*----------------------+-----------------------------------------------------+
+
+  []
+
+** {Get Storage Policy}
+
+  Get the storage policy of a file or a directory.
+
+  * Command:
+
++------------------------------------------+
+hdfs dfsadmin -getStoragePolicy <path>
++------------------------------------------+
+
+  * Arguments:
+
+*----------------------+-----------------------------------------------------+
+| <<<\<path\>>>>       | The path referring to either a directory or a file. |
+*----------------------+-----------------------------------------------------+
+
+  []

http://git-wip-us.apache.org/repos/asf/hadoop/blob/b014e83b/hadoop-hdfs-project/hadoop-hdfs/src/site/apt/HDFSCommands.apt.vm
----------------------------------------------------------------------
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/site/apt/HDFSCommands.apt.vm 
b/hadoop-hdfs-project/hadoop-hdfs/src/site/apt/HDFSCommands.apt.vm
index 6eb60f0..170f352 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/site/apt/HDFSCommands.apt.vm
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/site/apt/HDFSCommands.apt.vm
@@ -147,18 +147,19 @@ HDFS Commands Guide
 *-----------------+-----------------------------------------------------------+
 | -regular        | Normal datanode startup (default).
 *-----------------+-----------------------------------------------------------+
-| -rollback       | Rollsback the datanode to the previous version. This should
+| -rollback       | Roll back the datanode to the previous version. This should
 |                 | be used after stopping the datanode and distributing the
 |                 | old hadoop version.
 *-----------------+-----------------------------------------------------------+
-| -rollingupgrade rollback | Rollsback a rolling upgrade operation.
+| -rollingupgrade rollback | Roll back a rolling upgrade operation.
 *-----------------+-----------------------------------------------------------+
 
 ** <<<dfsadmin>>>
 
    Runs a HDFS dfsadmin client.
 
-   Usage: <<<hdfs dfsadmin [GENERIC_OPTIONS]
++------------------------------------------+
+   Usage: hdfs dfsadmin [GENERIC_OPTIONS]
           [-report [-live] [-dead] [-decommissioning]]
           [-safemode enter | leave | get | wait]
           [-saveNamespace]
@@ -169,6 +170,8 @@ HDFS Commands Guide
           [-clrQuota <dirname>...<dirname>]
           [-setSpaceQuota <quota> <dirname>...<dirname>]
           [-clrSpaceQuota <dirname>...<dirname>]
+          [-setStoragePolicy <path> <policyName>]
+          [-getStoragePolicy <path>]
           [-finalizeUpgrade]
           [-rollingUpgrade [<query>|<prepare>|<finalize>]]
           [-metasave filename]
@@ -186,7 +189,8 @@ HDFS Commands Guide
           [-fetchImage <local directory>]
           [-shutdownDatanode <datanode_host:ipc_port> [upgrade]]
           [-getDatanodeInfo <datanode_host:ipc_port>]
-          [-help [cmd]]>>>
+          [-help [cmd]]
++------------------------------------------+
 
 *-----------------+-----------------------------------------------------------+
 || COMMAND_OPTION || Description
@@ -236,6 +240,10 @@ HDFS Commands Guide
                   | 
{{{../hadoop-hdfs/HdfsQuotaAdminGuide.html#Administrative_Commands}HDFS Quotas 
Guide}}
                   | for the detail.
 *-----------------+-----------------------------------------------------------+
+| -setStoragePolicy \<path\> \<policyName\> | Set a storage policy to a file 
or a directory.
+*-----------------+-----------------------------------------------------------+
+| -getStoragePolicy \<path\> | Get the storage policy of a file or a directory.
+*-----------------+-----------------------------------------------------------+
 | -finalizeUpgrade| Finalize upgrade of HDFS. Datanodes delete their previous
                   | version working directories, followed by Namenode doing the
                   | same. This completes the upgrade process.
@@ -250,7 +258,7 @@ HDFS Commands Guide
                   | <filename> will contain one line for each of the following\
                   | 1. Datanodes heart beating with Namenode\
                   | 2. Blocks waiting to be replicated\
-                  | 3. Blocks currrently being replicated\
+                  | 3. Blocks currently being replicated\
                   | 4. Blocks waiting to be deleted
 *-----------------+-----------------------------------------------------------+
 | -refreshServiceAcl | Reload the service-level authorization policy file.
@@ -312,12 +320,30 @@ HDFS Commands Guide
                   | is specified.
 *-----------------+-----------------------------------------------------------+
 
+** <<<mover>>>
+
+   Runs the data migration utility.
+   See {{{./ArchivalStorage.html#Mover_-_A_New_Data_Migration_Tool}Mover}} for 
more details.
+
+   Usage: <<<hdfs mover [-p \<files/dirs\> | -f \<local file\>]>>>
+
+*--------------------+--------------------------------------------------------+
+|| COMMAND_OPTION    || Description
+*--------------------+--------------------------------------------------------+
+| -p \<files/dirs\>  | Specify a space separated list of HDFS files/dirs to 
migrate.
+*--------------------+--------------------------------------------------------+
+| -f \<local file\>  | Specify a local file containing a list of HDFS 
files/dirs to migrate.
+*--------------------+--------------------------------------------------------+
+
+  Note that, when both -p and -f options are omitted, the default path is the 
root directory.
+
 ** <<<namenode>>>
 
    Runs the namenode. More info about the upgrade, rollback and finalize is at
    {{{./HdfsUserGuide.html#Upgrade_and_Rollback}Upgrade Rollback}}.
 
-   Usage: <<<hdfs namenode [-backup] |
++------------------------------------------+
+   Usage: hdfs namenode [-backup] |
           [-checkpoint] |
           [-format [-clusterid cid ] [-force] [-nonInteractive] ] |
           [-upgrade [-clusterid cid] [-renameReserved<k-v pairs>] ] |
@@ -329,7 +355,8 @@ HDFS Commands Guide
           [-initializeSharedEdits] |
           [-bootstrapStandby] |
           [-recover [-force] ] |
-          [-metadataVersion ]>>>
+          [-metadataVersion ]
++------------------------------------------+
 
 *--------------------+--------------------------------------------------------+
 || COMMAND_OPTION    || Description
@@ -351,7 +378,7 @@ HDFS Commands Guide
 | -upgradeOnly [-clusterid cid] [-renameReserved\<k-v pairs\>] | Upgrade the
                      | specified NameNode and then shutdown it.
 *--------------------+--------------------------------------------------------+
-| -rollback          | Rollsback the NameNode to the previous version. This
+| -rollback          | Roll back the NameNode to the previous version. This
                      | should be used after stopping the cluster and
                      | distributing the old Hadoop version.
 *--------------------+--------------------------------------------------------+

http://git-wip-us.apache.org/repos/asf/hadoop/blob/b014e83b/hadoop-project/src/site/site.xml
----------------------------------------------------------------------
diff --git a/hadoop-project/src/site/site.xml b/hadoop-project/src/site/site.xml
index a42aff0..991447f 100644
--- a/hadoop-project/src/site/site.xml
+++ b/hadoop-project/src/site/site.xml
@@ -93,6 +93,7 @@
       <item name="Extended Attributes" 
href="hadoop-project-dist/hadoop-hdfs/ExtendedAttributes.html"/>
       <item name="Transparent Encryption" 
href="hadoop-project-dist/hadoop-hdfs/TransparentEncryption.html"/>
       <item name="HDFS Support for Multihoming" 
href="hadoop-project-dist/hadoop-hdfs/HdfsMultihoming.html"/>
+      <item name="Archival Storage" 
href="hadoop-project-dist/hadoop-hdfs/ArchivalStorage.html"/>
     </menu>
 
     <menu name="MapReduce" inherit="top">

Reply via email to