http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/1fcc8cee/docs/topics/impala_kerberos.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_kerberos.xml b/docs/topics/impala_kerberos.xml new file mode 100644 index 0000000..7c59185 --- /dev/null +++ b/docs/topics/impala_kerberos.xml @@ -0,0 +1,370 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd"> +<concept id="kerberos"> + + <title>Enabling Kerberos Authentication for Impala</title> + <prolog> + <metadata> + <data name="Category" value="Security"/> + <data name="Category" value="Kerberos"/> + <data name="Category" value="Authentication"/> + <data name="Category" value="Impala"/> + <data name="Category" value="Configuring"/> + <data name="Category" value="Starting and Stopping"/> + <data name="Category" value="Administrators"/> + </metadata> + </prolog> + + <conbody> + + <p> + Impala supports Kerberos authentication. For more information on enabling Kerberos authentication, see the + topic on Configuring Hadoop Security in the + <xref href="http://www.cloudera.com/content/cloudera-content/cloudera-docs/CDH4/latest/CDH4-Security-Guide/cdh4sg_topic_3.html" scope="external" format="html">CDH4 + Security Guide</xref> or the +<!-- Original URL: http://www.cloudera.com/content/cloudera-content/cloudera-docs/CDH5/latest/CDH5-Security-Guide/cdh_sg_cdh5_hadoop_security.html --> + <xref href="http://www.cloudera.com/documentation/enterprise/latest/topics/cdh_sg_cdh5_hadoop_security.html" scope="external" format="html">CDH + 5 Security Guide</xref>. + </p> + + <p> + When using Impala in a managed environment, Cloudera Manager automatically completes Kerberos configuration. + In an unmanaged environment, create a Kerberos principal for each host running <cmdname>impalad</cmdname> or + <cmdname>statestored</cmdname>. 
Cloudera recommends using a consistent format, such as + <codeph>impala/_HOST@Your-Realm</codeph>, but you can use any three-part Kerberos server principal. + </p> + + <p conref="../shared/impala_common.xml#common/user_kerberized"/> + + <note conref="../shared/impala_common.xml#common/authentication_vs_authorization"/> + + <p outputclass="toc inpage"/> + + <p> + An alternative form of authentication you can use is LDAP, described in <xref href="impala_ldap.xml#ldap"/>. + </p> + </conbody> + + <concept id="kerberos_prereqs"> + + <title>Requirements for Using Impala with Kerberos</title> + <prolog> + <metadata> + <data name="Category" value="Requirements"/> + <data name="Category" value="Planning"/> + </metadata> + </prolog> + + <conbody> + + <p conref="../shared/impala_common.xml#common/rhel5_kerberos"/> + +<!-- This note adapted from the one at http://www.cloudera.com/content/cloudera-content/cloudera-docs/CDH4/latest/CDH4-Security-Guide/cdh4sg_topic_3_4.html. + Ideally should be conref'ed in both places. --> + + <note type="important"> + <p> + If you plan to use Impala in your cluster, you must configure your KDC to allow tickets to be renewed, + and you must configure <filepath>krb5.conf</filepath> to request renewable tickets. Typically, you can do + this by adding the <codeph>max_renewable_life</codeph> setting to your realm in + <filepath>kdc.conf</filepath>, and by adding the <codeph>renew_lifetime</codeph> parameter to the + <codeph>libdefaults</codeph> section of <filepath>krb5.conf</filepath>. For more information about + renewable tickets, see the + <xref href="http://web.mit.edu/Kerberos/krb5-1.8/" scope="external" format="html">Kerberos + documentation</xref>. + </p> + <p rev="1.2"> + Currently, you cannot use the resource management feature in CDH 5 on a cluster that has Kerberos + authentication enabled.
+ </p> + </note> + + <p> + Start all <cmdname>impalad</cmdname> and <cmdname>statestored</cmdname> daemons with the + <codeph>--principal</codeph> and <codeph>--keytab-file</codeph> flags set to the principal and full path + name of the <codeph>keytab</codeph> file containing the credentials for the principal. + </p> + + <p> + Impala supports the Cloudera ODBC driver, including the Kerberos interface that the driver provides. To use Kerberos through the + ODBC driver, the host type must be set depending on the version of the ODBC driver: + </p> + + <ul> + <li> + <codeph>SecImpala</codeph> for the ODBC 1.0 driver. + </li> + + <li> + <codeph>SecBeeswax</codeph> for the ODBC 1.2 driver. + </li> + + <li> + Blank for the ODBC 2.0 driver or higher, when connecting to a secure cluster. + </li> + + <li> + <codeph>HS2NoSasl</codeph> for the ODBC 2.0 driver or higher, when connecting to a non-secure cluster. + </li> + </ul> + + <p> + To enable Kerberos in the Impala shell, start the <cmdname>impala-shell</cmdname> command using the + <codeph>-k</codeph> flag. + </p> + + <p> + To enable Impala to work with Kerberos security on your Hadoop cluster, make sure you perform the + installation and configuration steps in +<!-- Original URL: http://www.cloudera.com/content/cloudera-content/cloudera-docs/CDH5/latest/CDH5-Security-Guide/CDH5-Security-Guide.html --> + <xref href="http://www.cloudera.com/documentation/enterprise/latest/topics/sg_authentication.html" scope="external" format="html">Authentication in the CDH 5 Security Guide</xref> + or + the topic on Configuring Hadoop Security in the <xref href="http://www.cloudera.com/content/cloudera-content/cloudera-docs/CDH4/latest/CDH4-Security-Guide/CDH4-Security-Guide.html" scope="external" format="html">CDH4 Security Guide</xref>. + Also note that when Kerberos security is enabled in Impala, a web browser that + supports Kerberos HTTP SPNEGO is required to access the Impala web console (for example, Firefox, Internet + Explorer, or Chrome).
+ </p> + + <p> + If the NameNode, Secondary NameNode, DataNode, JobTracker, TaskTrackers, ResourceManager, NodeManagers, + HttpFS, Oozie, Impala, or Impala statestore services are configured to use Kerberos HTTP SPNEGO + authentication, and two or more of these services are running on the same host, then all of the running + services must use the same HTTP principal and keytab file used for their HTTP endpoints. + </p> + </conbody> + </concept> + + <concept id="kerberos_config"> + + <title>Configuring Impala to Support Kerberos Security</title> + <prolog> + <metadata> + <data name="Category" value="Configuring"/> + </metadata> + </prolog> + + <conbody> + + <p> + Enabling Kerberos authentication for Impala involves steps that can be summarized as follows: + </p> + + <ul> + <li> + Creating service principals for Impala and the HTTP service. Principal names take the form: + <codeph><varname>serviceName</varname>/<varname>fully.qualified.domain.name</varname>@<varname>KERBEROS.REALM</varname></codeph> + </li> + + <li> + Creating, merging, and distributing keytab files for these principals. + </li> + + <li> + Editing <codeph>/etc/default/impala</codeph> (in a cluster not managed by Cloudera Manager), or editing the + <uicontrol>Security</uicontrol> settings in the Cloudera Manager interface, to accommodate Kerberos + authentication. + </li> + </ul> + </conbody> + + <concept id="kerberos_setup"> + + <title>Enabling Kerberos for Impala</title> + + <conbody> + +<!-- + <p> + <b>To enable Kerberos for Impala:</b> + </p> +--> + + <ol> + <li> + Create an Impala service principal, specifying the name of the OS user that the Impala daemons run + under, the fully qualified domain name of each node running <cmdname>impalad</cmdname>, and the realm + name. For example: +<codeblock>$ kadmin +kadmin: addprinc -requires_preauth -randkey impala/[email protected]</codeblock> + </li> + + <li> + Create an HTTP service principal.
For example: +<codeblock>kadmin: addprinc -randkey HTTP/[email protected]</codeblock> + <note> + The <codeph>HTTP</codeph> component of the service principal must be uppercase as shown in the + preceding example. + </note> + </li> + + <li> + Create <codeph>keytab</codeph> files with both principals. For example: +<codeblock>kadmin: xst -k impala.keytab impala/impala_host.example.com +kadmin: xst -k http.keytab HTTP/impala_host.example.com +kadmin: quit</codeblock> + </li> + + <li> + Use <codeph>ktutil</codeph> to read the contents of the two keytab files and then write those contents + to a new file. For example: +<codeblock>$ ktutil +ktutil: rkt impala.keytab +ktutil: rkt http.keytab +ktutil: wkt impala-http.keytab +ktutil: quit</codeblock> + </li> + + <li> + (Optional) Test that credentials in the merged keytab file are valid, and that the <q>renew until</q> + date is in the future. For example: +<codeblock>$ klist -e -k -t impala-http.keytab</codeblock> + </li> + + <li> + Copy the <filepath>impala-http.keytab</filepath> file to the Impala configuration directory. Change the + permissions to be only read for the file owner and change the file owner to the <codeph>impala</codeph> + user. By default, the Impala user and group are both named <codeph>impala</codeph>. For example: +<codeblock>$ cp impala-http.keytab /etc/impala/conf +$ cd /etc/impala/conf +$ chmod 400 impala-http.keytab +$ chown impala:impala impala-http.keytab</codeblock> + </li> + + <li> + Add Kerberos options to the Impala defaults file, <filepath>/etc/default/impala</filepath>. Add the + options for both the <cmdname>impalad</cmdname> and <cmdname>statestored</cmdname> daemons, using the + <codeph>IMPALA_SERVER_ARGS</codeph> and <codeph>IMPALA_STATE_STORE_ARGS</codeph> variables. For + example, you might add: +<!-- Found these in a discussion post somewhere but not applicable as Impala startup options. 
+-kerberos_ticket_life=36000 +-maxrenewlife 7days +--> +<codeblock>-kerberos_reinit_interval=60 +-principal=impala_1/[email protected] +-keytab_file=/var/run/cloudera-scm-agent/process/3212-impala-IMPALAD/impala.keytab</codeblock> + <p> + For more information on changing the Impala defaults specified in + <filepath>/etc/default/impala</filepath>, see + <xref href="impala_config_options.xml#config_options">Modifying Impala Startup + Options</xref>. + </p> + </li> + </ol> + + <note> + Restart <cmdname>impalad</cmdname> and <cmdname>statestored</cmdname> for these configuration changes to + take effect. + </note> + </conbody> + </concept> + </concept> + + <concept id="kerberos_proxy"> + + <title>Enabling Kerberos for Impala with a Proxy Server</title> + + <conbody> + + <p> + A common configuration for Impala with High Availability is to use a proxy server to submit requests to the + actual <cmdname>impalad</cmdname> daemons on different hosts in the cluster. This configuration avoids + connection problems in case of machine failure, because the proxy server can route new requests through one + of the remaining hosts in the cluster. This configuration also helps with load balancing, because the + additional overhead of being the <q>coordinator node</q> for each query is spread across multiple hosts. + </p> + + <p> + Although you can set up a proxy server with or without Kerberos authentication, typically users set up a + secure Kerberized configuration. For information about setting up a proxy server for Impala, including + Kerberos-specific steps, see <xref href="impala_proxy.xml#proxy"/>. + </p> + </conbody> + </concept> + + <concept id="spnego"> + + <title>Using a Web Browser to Access a URL Protected by Kerberos HTTP SPNEGO</title> + + <conbody> + + <p> + Your web browser must support Kerberos HTTP SPNEGO. For example, Chrome, Firefox, or Internet Explorer. 
+ </p> + + <p> + <b>To configure Firefox to access a URL protected by Kerberos HTTP SPNEGO:</b> + </p> + + <ol> + <li> + Open the advanced settings Firefox configuration page by loading the <codeph>about:config</codeph> page. + </li> + + <li> + Use the <b>Filter</b> text box to find <codeph>network.negotiate-auth.trusted-uris</codeph>. + </li> + + <li> + Double-click the <codeph>network.negotiate-auth.trusted-uris</codeph> preference and enter the hostname + or the domain of the web server that is protected by Kerberos HTTP SPNEGO. Separate multiple domains and + hostnames with a comma. + </li> + + <li> + Click <b>OK</b>. + </li> + </ol> + </conbody> + </concept> + + <concept id="kerberos_delegation"> + <title>Enabling Impala Delegation for Kerberos Users</title> + <conbody> + <p> + See <xref href="impala_delegation.xml#delegation"/> for details about the delegation feature + that lets certain users submit queries using the credentials of other users. + </p> + </conbody> + </concept> + + <concept id="ssl_jdbc_odbc"> + <title>Using TLS/SSL with Business Intelligence Tools</title> + <conbody> + <p> + You can use Kerberos authentication, TLS/SSL encryption, or both to secure + connections from JDBC and ODBC applications to Impala. + See <xref href="impala_jdbc.xml#impala_jdbc"/> and <xref href="impala_odbc.xml#impala_odbc"/> + for details. + </p> + + <p conref="../shared/impala_common.xml#common/hive_jdbc_ssl_kerberos_caveat"/> + </conbody> + </concept> + + <concept id="whitelisting_internal_apis"> + <title>Enabling Access to Internal Impala APIs for Kerberos Users</title> + <conbody> + <!-- Reusing (most of) the text from the New Features bullet here. Turn into a conref in both places. --> + <p rev="IMPALA-3095"> + For applications that need direct access + to Impala APIs, without going through the HiveServer2 or Beeswax interfaces, you can + specify a list of Kerberos users who are allowed to call those APIs. 
By default, the + <codeph>impala</codeph> and <codeph>hdfs</codeph> users are the only ones authorized + for this kind of access. + Any users not explicitly authorized through the <codeph>internal_principals_whitelist</codeph> + configuration setting are blocked from accessing the APIs. This setting applies to all the + Impala-related daemons, although currently it is primarily used for HDFS to control the + behavior of the catalog server. + </p> + </conbody> + + </concept> + + <concept id="auth_to_local" rev="IMPALA-2660 CDH-40241"> + <title>Mapping Kerberos Principals to Short Names for Impala</title> + <conbody> + <p conref="../shared/impala_common.xml#common/auth_to_local_instructions"/> + </conbody> + </concept> + +</concept>
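For an unmanaged cluster, the Kerberos startup flags from the setup steps above end up in <codeph>/etc/default/impala</codeph>. The following is a minimal sketch of that fragment; the realm, principal, and keytab path are illustrative placeholders rather than values taken from this document, and in a real deployment these flags are appended to whatever options the two variables already contain:

```shell
# Hypothetical /etc/default/impala fragment (placeholder realm and paths).
# Append these flags to any options already present in each variable.
IMPALA_SERVER_ARGS=" \
    -kerberos_reinit_interval=60 \
    -principal=impala/[email protected] \
    -keytab_file=/etc/impala/conf/impala-http.keytab"

IMPALA_STATE_STORE_ARGS=" \
    -kerberos_reinit_interval=60 \
    -principal=impala/[email protected] \
    -keytab_file=/etc/impala/conf/impala-http.keytab"
```

After editing the file, restart the <cmdname>impalad</cmdname> and <cmdname>statestored</cmdname> daemons for the flags to take effect.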
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/1fcc8cee/docs/topics/impala_ldap.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_ldap.xml b/docs/topics/impala_ldap.xml new file mode 100644 index 0000000..f2ef523 --- /dev/null +++ b/docs/topics/impala_ldap.xml @@ -0,0 +1,354 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd"> +<concept id="ldap"> + + <title>Enabling LDAP Authentication for Impala</title> + <prolog> + <metadata> + <data name="Category" value="Security"/> + <data name="Category" value="LDAP"/> + <data name="Category" value="Authentication"/> + <data name="Category" value="Impala"/> + <data name="Category" value="Configuring"/> + <data name="Category" value="Starting and Stopping"/> + <data name="Category" value="Administrators"/> + </metadata> + </prolog> + + <conbody> + +<!-- Similar discussion under 'Authentication' parent topic. Maybe do some conref'ing or linking upward. --> + + <p> Authentication is the process of allowing only specified named users to + access the server (in this case, the Impala server). This feature is + crucial for any production deployment, to prevent misuse, tampering, or + excessive load on the server. Impala uses LDAP for authentication, + verifying the credentials of each user who connects through + <cmdname>impala-shell</cmdname>, Hue, a Business Intelligence tool, JDBC + or ODBC application, and so on. </p> + + <note conref="../shared/impala_common.xml#common/authentication_vs_authorization"/> + + <p outputclass="toc inpage"/> + + <p> + An alternative form of authentication you can use is Kerberos, described in + <xref href="impala_kerberos.xml#kerberos"/>. 
+ </p> + </conbody> + + <concept id="ldap_prereqs"> + + <title>Requirements for Using Impala with LDAP</title> + <prolog> + <metadata> + <data name="Category" value="Requirements"/> + <data name="Category" value="Planning"/> + </metadata> + </prolog> + + <conbody> + + <p rev="1.4.0"> + Authentication against LDAP servers is available in Impala 1.2.2 and higher. Impala 1.4.0 adds support for + secure LDAP authentication through SSL and TLS. + </p> + + <p> + The Impala LDAP support lets you use Impala with systems such as Active Directory that use LDAP behind the + scenes. + </p> + </conbody> + </concept> + + <concept id="ldap_client_server"> + + <title>Client-Server Considerations for LDAP</title> + + <conbody> + + <p> + Only client->Impala connections can be authenticated by LDAP. + </p> + + <p> You must use the Kerberos authentication mechanism for connections + between internal Impala components, such as between the + <cmdname>impalad</cmdname>, <cmdname>statestored</cmdname>, and + <cmdname>catalogd</cmdname> daemons. See <xref + href="impala_kerberos.xml#kerberos" /> on how to set up Kerberos for + Impala. </p> + </conbody> + </concept> + + <concept id="ldap_config"> + + <title>Server-Side LDAP Setup</title> + + <conbody> + + <p> + These requirements apply on the server side when configuring and starting Impala: + </p> + + <p> + To enable LDAP authentication, set the following startup options for <cmdname>impalad</cmdname>: + </p> + + <ul> + <li> + <codeph>--enable_ldap_auth</codeph> enables LDAP-based authentication between the client and Impala. + </li> + + <li rev="1.4.0"> + <codeph>--ldap_uri</codeph> sets the URI of the LDAP server to use. Typically, the URI is prefixed with + <codeph>ldap://</codeph>. In Impala 1.4.0 and higher, you can specify secure SSL-based LDAP transport by + using the prefix <codeph>ldaps://</codeph>. 
The URI can optionally specify the port, for example: + <codeph>ldap://ldap_server.cloudera.com:389</codeph> or + <codeph>ldaps://ldap_server.cloudera.com:636</codeph>. (389 and 636 are the default ports for non-SSL and + SSL LDAP connections, respectively.) + </li> + +<!-- Some amount of this bullet could be conref'ed. Similar but not identical bullet occurs later under TLS. --> + + <li rev="1.4.0"> + For <codeph>ldaps://</codeph> connections secured by SSL, + <codeph>--ldap_ca_certificate="<varname>/path/to/certificate/pem</varname>"</codeph> specifies the + location of the certificate in standard <codeph>.PEM</codeph> format. Store this certificate on the local + filesystem, in a location that only the <codeph>impala</codeph> user and other trusted users can read. + </li> + +<!-- Per Henry: not for public consumption. +<li> + If you need to provide a custom SASL configuration, + set <codeph>- -ldap_manual_config</codeph> to bypass all the automatic configuration. +</li> +--> + </ul> + </conbody> + </concept> + + <concept id="ldap_bind_strings"> + + <title>Support for Custom Bind Strings</title> + + <conbody> + + <p> + When Impala connects to LDAP it issues a bind call to the LDAP server to authenticate as the connected + user. Impala clients, including the Impala shell, provide the short name of the user to Impala. This is + necessary so that Impala can use Sentry for role-based access, which uses short names. + </p> + + <p> + However, LDAP servers often require more complex, structured usernames for authentication. Impala supports + three ways of transforming the short name (for example, <codeph>'henry'</codeph>) to a more complicated + string. If necessary, specify one of the following configuration options when starting the + <cmdname>impalad</cmdname> daemon on each DataNode: + </p> + + <ul> + <li> + <codeph>--ldap_domain</codeph>: Replaces the username with a string + <codeph><varname>username</varname>@<varname>ldap_domain</varname></codeph>. 
+ </li> + + <li> + <codeph>--ldap_baseDN</codeph>: Replaces the username with a <q>distinguished name</q> (DN) of the form: + <codeph>uid=<varname>userid</varname>,ldap_baseDN</codeph>. (This is equivalent to a Hive option). + </li> + + <li> + <codeph>--ldap_bind_pattern</codeph>: This is the most general option, and replaces the username with the + string <varname>ldap_bind_pattern</varname> where all instances of the string <codeph>#UID</codeph> are + replaced with <varname>userid</varname>. For example, an <codeph>ldap_bind_pattern</codeph> of + <codeph>"user=#UID,OU=foo,CN=bar"</codeph> with a username of <codeph>henry</codeph> will construct a + bind name of <codeph>"user=henry,OU=foo,CN=bar"</codeph>. + </li> + </ul> + + <p rev="CDH-26854"> + For clusters not managed by Cloudera Manager, + specify the option on the <cmdname>impalad</cmdname> command line. + For clusters managed by Cloudera Manager 5.4.0 and higher, + search for the configuration field names <codeph>ldap_domain</codeph>, + <codeph>ldap_basedn</codeph>, or <codeph>ldap_bind_pattern</codeph>, + fill in and save the appropriate field values, and restart the Impala service. + Prior to Cloudera Manager 5.4.0, these values were filled in using the + <uicontrol>Impala Daemon Command Line Argument Advanced Configuration Snippet (Safety Valve)</uicontrol> + field. + </p> + + <p> + These options are mutually exclusive; Impala does not start if more than one of these options is specified. + </p> + </conbody> + </concept> + + <concept id="ldap_security"> + + <title>Secure LDAP Connections</title> + + <conbody> + + <p> + To avoid sending credentials over the wire in cleartext, you must configure a secure connection between + both the client and Impala, and between Impala and the LDAP server. The secure connection could use SSL or + TLS. 
+ </p> + + <p> + <b>Secure LDAP connections through SSL:</b> + </p> + + <p> + For SSL-enabled LDAP connections, specify a prefix of <codeph>ldaps://</codeph> instead of + <codeph>ldap://</codeph>. Also, the default port for SSL-enabled LDAP connections is 636 instead of 389. + </p> + + <p rev="1.4.0"> + <b>Secure LDAP connections through TLS:</b> + </p> + + <p> + <xref href="http://en.wikipedia.org/wiki/Transport_Layer_Security" scope="external" format="html">TLS</xref>, + the successor to the SSL protocol, is supported by most modern LDAP servers. Unlike SSL connections, TLS + connections can be made on the same server port as non-TLS connections. To secure all connections using + TLS, specify the following flags as startup options to the <cmdname>impalad</cmdname> daemon: + </p> + + <ul> + <li> + <codeph>--ldap_tls</codeph> tells Impala to start a TLS connection to the LDAP server, and to fail + authentication if it cannot be done. + </li> + + <li rev="1.4.0"> + <codeph>--ldap_ca_certificate="<varname>/path/to/certificate/pem</varname>"</codeph> specifies the + location of the certificate in standard <codeph>.PEM</codeph> format. Store this certificate on the local + filesystem, in a location that only the <codeph>impala</codeph> user and other trusted users can read. + </li> + </ul> + </conbody> + </concept> + + <concept id="ldap_impala_shell"> + + <title>LDAP Authentication for impala-shell Interpreter</title> + + <conbody> + + <p> + To connect to Impala using LDAP authentication, you specify command-line options to the + <cmdname>impala-shell</cmdname> command interpreter and enter the password when prompted: + </p> + + <ul> + <li> + <codeph>-l</codeph> enables LDAP authentication. + </li> + + <li> + <codeph>-u</codeph> sets the user. Per Active Directory, the user is the short username, not the full + LDAP distinguished name. 
If your LDAP settings include a search base, use the + <codeph>--ldap_bind_pattern</codeph> on the <cmdname>impalad</cmdname> daemon to translate the short user + name from <cmdname>impala-shell</cmdname> automatically to the fully qualified name. +<!-- +include that as part of the +username, for example <codeph>[email protected]</codeph>. +--> + </li> + + <li> + <cmdname>impala-shell</cmdname> automatically prompts for the password. + </li> + </ul> + + <p> + For the full list of available <cmdname>impala-shell</cmdname> options, see + <xref href="impala_shell_options.xml#shell_options"/>. + </p> + + <p> + <b>LDAP authentication for JDBC applications:</b> See <xref href="impala_jdbc.xml#impala_jdbc"/> for the + format to use with the JDBC connection string for servers using LDAP authentication. + </p> + </conbody> + </concept> + <concept id="ldap_impala_hue"> + <title>Enabling LDAP for Impala in Hue</title> + <prolog> + <metadata> + <data name="Category" value="Hue"/> + </metadata> + </prolog> + <conbody> + <section id="ldap_impala_hue_cm"> + <title>Enabling LDAP for Impala in Hue Using Cloudera Manager</title> + <p> + <ol> + <li>Go to the Hue service.</li> + <li>Click the Configuration tab.</li> + <li>Select <menucascade><uicontrol>Scope</uicontrol><uicontrol>Hue + Server</uicontrol></menucascade>.</li> + <li>Select + <menucascade><uicontrol>Category</uicontrol><uicontrol>Advanced</uicontrol></menucascade>.</li> + <li>Add the following properties to the <b>Hue Server Advanced + Configuration Snippet (Safety Valve) for + hue_safety_valve_server.ini</b> + property.<codeblock>[impala] +auth_username=<LDAP username of Hue user to be authenticated> +auth_password=<LDAP password of Hue user to be authenticated></codeblock></li> + <li>Click <b>Save Changes</b>.</li> + </ol> + </p> + </section> + <section id="ldap_impala_hue_cmdline"> + <title>Enabling LDAP for Impala in Hue Using the Command Line</title> + <p>LDAP authentication for the Impala app in Hue can be enabled 
by + setting the following properties under the <codeph>[impala]</codeph> + section in <codeph>hue.ini</codeph>. <table id="ldap_impala_hue_configs"> + <tgroup cols="2"> + <colspec colname="1" colwidth="1*" /> + <colspec colname="2" colwidth="2*" /> + <tbody> + <row> + <entry><codeph>auth_username</codeph></entry> + <entry>LDAP username of Hue user to be authenticated.</entry> + </row> + <row> + <entry><codeph>auth_password</codeph></entry> + <entry> + <p>LDAP password of Hue user to be authenticated.</p> + </entry> + </row> + </tbody> + </tgroup> + </table>These login details are only used by Impala to authenticate to + LDAP. The Impala service trusts Hue to have already validated the user + being impersonated, rather than simply passing on the credentials.</p> + </section> + </conbody> + </concept> + + <concept id="ldap_delegation"> + <title>Enabling Impala Delegation for LDAP Users</title> + <conbody> + <p> + See <xref href="impala_delegation.xml#delegation"/> for details about the delegation feature + that lets certain users submit queries using the credentials of other users. + </p> + </conbody> + </concept> + + <concept id="ldap_restrictions"> + + <title>LDAP Restrictions for Impala</title> + + <conbody> + + <p> + The LDAP support is preliminary. It currently has only been tested against Active Directory. 
+ </p> + </conbody> + </concept> +</concept> http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/1fcc8cee/docs/topics/impala_lineage.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_lineage.xml b/docs/topics/impala_lineage.xml new file mode 100644 index 0000000..c05391c --- /dev/null +++ b/docs/topics/impala_lineage.xml @@ -0,0 +1,113 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd"> +<concept id="lineage" rev="2.2.0"> + + <title>Viewing Lineage Information for Impala Data</title> + <titlealts audience="PDF"><navtitle>Viewing Lineage Info</navtitle></titlealts> + <prolog> + + <metadata> + + <data name="Category" value="Impala"/> + <data name="Category" value="Lineage"/> + <data name="Category" value="Governance"/> + <data name="Category" value="Data Management"/> + <data name="Category" value="Navigator"/> + <data name="Category" value="Administrators"/> + + </metadata> + + </prolog> + + <conbody> + + <p rev="2.2.0"> + <indexterm audience="Cloudera">lineage</indexterm> + <indexterm audience="Cloudera">column lineage</indexterm> + <term>Lineage</term> is a feature in the Cloudera Navigator data + management component that helps you track where data originated, and how + data propagates through the system through SQL statements such as + <codeph>SELECT</codeph>, <codeph>INSERT</codeph>, and <codeph>CREATE + TABLE AS SELECT</codeph>. Impala is covered by the Cloudera Navigator + lineage features in CDH 5.4.0 and higher. </p> + + <p> + This type of tracking is important in high-security configurations, especially in highly regulated industries + such as healthcare, pharmaceuticals, financial services and intelligence. 
For such kinds of sensitive data, it is important to know all + the places in the system that contain that data or other data derived from it; to verify who has accessed + that data; and to be able to double-check that the data used to make a decision was processed correctly and + not tampered with. + </p> + + <p> + You interact with this feature through <term>lineage diagrams</term> showing relationships between tables and + columns. For instructions about interpreting lineage diagrams, see + <xref audience="integrated" href="cn_iu_lineage.xml" /><xref audience="standalone" href="http://www.cloudera.com/documentation/enterprise/latest/topics/cn_iu_lineage.html" scope="external" format="html"/>. + </p> + + <section id="column_lineage"> + + <title>Column Lineage</title> + + <p> + <term>Column lineage</term> tracks information in fine detail, at the level of + particular columns rather than entire tables. + </p> + + <p> + For example, if you have a table with information derived from web logs, you might copy that data into + other tables as part of the ETL process. The ETL operations might involve transformations through + expressions and function calls, and rearranging the columns into more or fewer tables + (<term>normalizing</term> or <term>denormalizing</term> the data). Then for reporting, you might issue + queries against multiple tables and views. In this example, column lineage helps you determine that data + that entered the system as <codeph>RAW_LOGS.FIELD1</codeph> was then turned into + <codeph>WEBSITE_REPORTS.IP_ADDRESS</codeph> through an <codeph>INSERT ... SELECT</codeph> statement. Or, + conversely, you could start with a reporting query against a view, and trace the origin of the data in a + field such as <codeph>TOP_10_VISITORS.USER_ID</codeph> back to the underlying table and even further back + to the point where the data was first loaded into Impala.
+ </p> + + <p> + When you have tables where you need to track or control access to sensitive information at the column + level, see <xref href="impala_authorization.xml#authorization"/> for how to implement column-level + security. You set up authorization using the Sentry framework, create views that refer to specific sets of + columns, and then assign authorization privileges to those views rather than the underlying tables. + </p> + + </section> + + <section id="lineage_data"> + + <title>Lineage Data for Impala</title> + + <p> + The lineage feature is enabled by default. When lineage logging is enabled, the serialized column lineage + graph is computed for each query and stored in a specialized log file in JSON format. + </p> + + <p> + Impala records queries in the lineage log if they complete successfully, or fail due to authorization + errors. For write operations such as <codeph>INSERT</codeph> and <codeph>CREATE TABLE AS SELECT</codeph>, + the statement is recorded in the lineage log only if it successfully completes. Therefore, the lineage + feature tracks data that was accessed by successful queries, or that was attempted to be accessed by + unsuccessful queries that were blocked due to authorization failure. These kinds of queries represent data + that really was accessed, or where the attempted access could represent malicious activity. + </p> + + <p> + Impala does not record in the lineage log queries that fail due to syntax errors or that fail or are + cancelled before they reach the stage of requesting rows from the result set. + </p> + + <p> + To enable or disable this feature on a system not managed by Cloudera Manager, set or remove the + <codeph>-lineage_event_log_dir</codeph> configuration option for the <cmdname>impalad</cmdname> daemon. 
For + information about turning the lineage feature on and off through Cloudera Manager, see + <xref audience="integrated" href="datamgmt_impala_lineage_log.xml"/><xref audience="standalone" href="http://www.cloudera.com/documentation/enterprise/latest/topics/datamgmt_impala_lineage_log.html" scope="external" format="html"/>. + </p> + + </section> + + </conbody> + +</concept> http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/1fcc8cee/docs/topics/impala_mixed_security.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_mixed_security.xml b/docs/topics/impala_mixed_security.xml new file mode 100644 index 0000000..b9e6933 --- /dev/null +++ b/docs/topics/impala_mixed_security.xml @@ -0,0 +1,46 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd"> +<concept id="mixed_security"> + + <title>Using Multiple Authentication Methods with Impala</title> + <prolog> + <metadata> + <data name="Category" value="Security"/> + <data name="Category" value="Impala"/> + <data name="Category" value="Authentication"/> + <data name="Category" value="Kerberos"/> + <data name="Category" value="LDAP"/> + <data name="Category" value="Administrators"/> + </metadata> + </prolog> + + <conbody> + + <p> + Impala 2.0 and later automatically handles both Kerberos and LDAP authentication. Each + <cmdname>impalad</cmdname> daemon can accept both Kerberos and LDAP requests through the same port. No + special actions need to be taken if some users authenticate through Kerberos and some through LDAP. + </p> + + <p> + Prior to Impala 2.0, you had to configure each <cmdname>impalad</cmdname> to listen on a specific port + depending on the kind of authentication, then configure your network load balancer to forward each kind of + request to a DataNode that was set up with the appropriate authentication type. 
Once the initial request was + made using either Kerberos or LDAP authentication, Impala automatically handled the process of coordinating + the work across multiple nodes and transmitting intermediate results back to the coordinator node. + </p> + +<!-- + <p> + This technique is most suitable for larger clusters, where + you are already using load balancing software for high availability. + You configure Impala to run on a different port on the nodes configured for LDAP. + Then you configure the load balancing software to forward Kerberos + connection requests to nodes using the default port, and LDAP connection requests + to nodes using an alternative port for LDAP. + Consult the documentation for your load balancing software for how to + configure that type of forwarding. + </p> +--> + </conbody> +</concept> http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/1fcc8cee/docs/topics/impala_noncm_installation.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_noncm_installation.xml b/docs/topics/impala_noncm_installation.xml new file mode 100644 index 0000000..b1ee0ef --- /dev/null +++ b/docs/topics/impala_noncm_installation.xml @@ -0,0 +1,175 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd"> +<concept id="noncm_installation"> + + <title>Installing Impala without Cloudera Manager</title> + <prolog> + <metadata> + <data name="Category" value="Impala"/> + <data name="Category" value="Installing"/> + <data name="Category" value="Administrators"/> + </metadata> + </prolog> + + <conbody> + + <p> + Before installing Impala manually, make sure all applicable nodes have the appropriate hardware + configuration, levels of operating system and CDH, and any other software prerequisites. See + <xref href="impala_prereqs.xml#prereqs"/> for details. 
+ </p> + + <p> + You can install Impala across many hosts or on one host: + </p> + + <ul> + <li> + Installing Impala across multiple machines creates a distributed configuration. For best performance, + install Impala on <b>all</b> DataNodes. + </li> + + <li> + Installing Impala on a single machine produces a pseudo-distributed cluster. + </li> + </ul> + + <p> + <b>To install Impala on a host:</b> + </p> + + <ol> + <li> + Install CDH as described in the Installation section of the + <xref href="http://www.cloudera.com/content/cloudera-content/cloudera-docs/CDH4/latest/CDH4-Installation-Guide/CDH4-Installation-Guide.html" scope="external" format="html">CDH + 4 Installation Guide</xref> or the +<!-- Original URL: http://www.cloudera.com/content/cloudera-content/cloudera-docs/CDH5/latest/CDH5-Installation-Guide/CDH5-Installation-Guide.html --> + <xref href="http://www.cloudera.com/documentation/enterprise/latest/topics/installation.html" scope="external" format="html">CDH + 5 Installation Guide</xref>. + </li> + + <li> + <p> + Install the Hive metastore somewhere in your cluster, as described in the Hive Installation topic in the + <xref href="http://www.cloudera.com/content/cloudera-content/cloudera-docs/CDH4/latest/CDH4-Installation-Guide/cdh4ig_topic_18.html" scope="external" format="html">CDH + 4 Installation Guide</xref> or the +<!-- Original URL: http://www.cloudera.com/content/cloudera-content/cloudera-docs/CDH5/latest/CDH5-Installation-Guide/cdh_ig_hive_installation.html --> + <xref href="http://www.cloudera.com/documentation/enterprise/latest/topics/cdh_ig_hive_installation.html" scope="external" format="html">CDH + 5 Installation Guide</xref>. As part of this process, you configure the Hive metastore to use an external + database as a metastore. Impala uses this same database for its own table metadata. You can choose either + a MySQL or PostgreSQL database as the metastore. 
The process for configuring each type of database is
+ described in the CDH Installation Guide.
+ </p>
+ <p>
+ Cloudera recommends setting up a Hive metastore service rather than connecting directly to the metastore
+ database; this configuration is required when running Impala under CDH 4.1. Make sure the
+ <filepath>/etc/impala/conf/hive-site.xml</filepath> file contains the following setting, substituting the
+ appropriate hostname for <varname>metastore_server_host</varname>:
+ </p>
+<codeblock><property>
+<name>hive.metastore.uris</name>
+<value>thrift://<varname>metastore_server_host</varname>:9083</value>
+</property>
+<property>
+<name>hive.metastore.client.socket.timeout</name>
+<value>3600</value>
+<description>MetaStore Client socket timeout in seconds</description>
+</property></codeblock>
+ </li>
+
+ <li>
+ (Optional) If you installed the full Hive component on any host, you can verify that the metastore is
+ configured properly by starting the Hive console and querying for the list of available tables. Once you
+ confirm that the console starts, exit the console to continue the installation:
+<codeblock>$ hive
+Hive history file=/tmp/root/hive_job_log_root_201207272011_678722950.txt
+hive> show tables;
+table1
+table2
+hive> quit;
+$</codeblock>
+ </li>
+
+ <li>
+ Confirm that your package management command is aware of the Impala repository settings, as described in
+ <xref href="impala_prereqs.xml#prereqs"/>. (For CDH 4, Impala uses a different repository than CDH itself.) You
+ might need to download a repo or list file into a system directory underneath <filepath>/etc</filepath>.
+ </li> + + <li> + Use <b>one</b> of the following sets of commands to install the Impala package: + <p> + <b>For RHEL, Oracle Linux, or CentOS systems:</b> + </p> +<codeblock rev="1.2">$ sudo yum install impala # Binaries for daemons +$ sudo yum install impala-server # Service start/stop script +$ sudo yum install impala-state-store # Service start/stop script +$ sudo yum install impala-catalog # Service start/stop script +</codeblock> + <p> + <b>For SUSE systems:</b> + </p> +<codeblock rev="1.2">$ sudo zypper install impala # Binaries for daemons +$ sudo zypper install impala-server # Service start/stop script +$ sudo zypper install impala-state-store # Service start/stop script +$ sudo zypper install impala-catalog # Service start/stop script +</codeblock> + <p> + <b>For Debian or Ubuntu systems:</b> + </p> +<codeblock rev="1.2">$ sudo apt-get install impala # Binaries for daemons +$ sudo apt-get install impala-server # Service start/stop script +$ sudo apt-get install impala-state-store # Service start/stop script +$ sudo apt-get install impala-catalog # Service start/stop script +</codeblock> + <note> + Cloudera recommends that you not install Impala on any HDFS NameNode. Installing Impala on NameNodes + provides no additional data locality, and executing queries with such a configuration might cause memory + contention and negatively impact the HDFS NameNode. + </note> + </li> + + <li> + Copy the client <codeph>hive-site.xml</codeph>, <codeph>core-site.xml</codeph>, + <codeph>hdfs-site.xml</codeph>, and <codeph>hbase-site.xml</codeph> configuration files to the Impala + configuration directory, which defaults to <codeph>/etc/impala/conf</codeph>. Create this directory if it + does not already exist. + </li> + + <li> + Use <b>one</b> of the following commands to install <codeph>impala-shell</codeph> on the machines from + which you want to issue queries. 
You can install <codeph>impala-shell</codeph> on any supported machine + that can connect to DataNodes that are running <codeph>impalad</codeph>. + <p> + <b>For RHEL/CentOS systems:</b> + </p> +<codeblock>$ sudo yum install impala-shell</codeblock> + <p> + <b>For SUSE systems:</b> + </p> +<codeblock>$ sudo zypper install impala-shell</codeblock> + <p> + <b>For Debian/Ubuntu systems:</b> + </p> +<codeblock>$ sudo apt-get install impala-shell</codeblock> + </li> + + <li> + Complete any required or recommended configuration, as described in + <xref href="impala_config_performance.xml#config_performance"/>. Some of these configuration changes are + mandatory. (They are applied automatically when you install using Cloudera Manager.) + </li> + </ol> + + <p> + Once installation and configuration are complete, see <xref href="impala_processes.xml#processes"/> for how + to activate the software on the appropriate nodes in your cluster. + </p> + + <p> + If this is your first time setting up and using Impala in this cluster, run through some of the exercises in + <xref href="impala_tutorial.xml#tutorial"/> to verify that you can do basic operations such as creating + tables and querying them. 
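+ </p>
+
+ <p>
+ For example, the following <cmdname>impala-shell</cmdname> session is a minimal sanity check.
+ (The table name and column are arbitrary; any small throwaway table works.)
+ </p>
+<codeblock>[impala-host:21000] > create table sanity_check (x int);
+[impala-host:21000] > insert into sanity_check values (1), (2), (3);
+[impala-host:21000] > select count(*) from sanity_check;
+[impala-host:21000] > drop table sanity_check;</codeblock>
+ <p>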
+ </p> + </conbody> +</concept> http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/1fcc8cee/docs/topics/impala_perf_benchmarking.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_perf_benchmarking.xml b/docs/topics/impala_perf_benchmarking.xml new file mode 100644 index 0000000..b2e058d --- /dev/null +++ b/docs/topics/impala_perf_benchmarking.xml @@ -0,0 +1,36 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd"> +<concept id="perf_benchmarks"> + + <title>Benchmarking Impala Queries</title> + <titlealts audience="PDF"><navtitle>Benchmarking</navtitle></titlealts> + <prolog> + <metadata> + <data name="Category" value="Performance"/> + <data name="Category" value="Impala"/> + <data name="Category" value="Querying"/> + <data name="Category" value="Proof of Concept"/> + <data name="Category" value="Developers"/> + <data name="Category" value="Data Analysts"/> + </metadata> + </prolog> + + <conbody> + + <p> + Because Impala, like other Hadoop components, is designed to handle large data volumes in a distributed + environment, conduct any performance tests using realistic data and cluster configurations. Use a multi-node + cluster rather than a single node; run queries against tables containing terabytes of data rather than tens + of gigabytes. The parallel processing techniques used by Impala are most appropriate for workloads that are + beyond the capacity of a single server. + </p> + + <p> + When you run queries returning large numbers of rows, the CPU time to pretty-print the output can be + substantial, giving an inaccurate measurement of the actual query time. Consider using the + <codeph>-B</codeph> option on the <codeph>impala-shell</codeph> command to turn off the pretty-printing, and + optionally the <codeph>-o</codeph> option to store query results in a file rather than printing to the + screen. 
See <xref href="impala_shell_options.xml#shell_options"/> for details. + </p> + </conbody> +</concept> http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/1fcc8cee/docs/topics/impala_perf_cookbook.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_perf_cookbook.xml b/docs/topics/impala_perf_cookbook.xml new file mode 100644 index 0000000..a42f7c9 --- /dev/null +++ b/docs/topics/impala_perf_cookbook.xml @@ -0,0 +1,269 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd"> +<concept id="perf_cookbook"> + + <title>Impala Performance Guidelines and Best Practices</title> + <titlealts audience="PDF"><navtitle>Performance Best Practices</navtitle></titlealts> + <prolog> + <metadata> + <data name="Category" value="Performance"/> + <data name="Category" value="Impala"/> + <data name="Category" value="Planning"/> + <data name="Category" value="Proof of Concept"/> + <data name="Category" value="Guidelines"/> + <data name="Category" value="Best Practices"/> + <data name="Category" value="Proof of Concept"/> + <data name="Category" value="Developers"/> + <data name="Category" value="Data Analysts"/> + </metadata> + </prolog> + + <conbody> + + <p> + Here are performance guidelines and best practices that you can use during planning, experimentation, and + performance tuning for an Impala-enabled CDH cluster. 
All of this information is also available in more
+ detail elsewhere in the Impala documentation; it is gathered together here to serve as a cookbook and
+ emphasize which performance techniques typically provide the highest return on investment.
+ </p>
+
+ <p outputclass="toc inpage"/>
+
+ <section id="perf_cookbook_file_format">
+
+ <title>Choose the appropriate file format for the data.</title>
+
+ <p>
+ Typically, for large volumes of data (multiple gigabytes per table or partition), the Parquet file format
+ performs best because of its combination of columnar storage layout, large I/O request size, and
+ compression and encoding. See <xref href="impala_file_formats.xml#file_formats"/> for comparisons of all
+ file formats supported by Impala, and <xref href="impala_parquet.xml#parquet"/> for details about the
+ Parquet file format.
+ </p>
+
+ <note>
+ For smaller volumes of data, a few gigabytes or less for each table or partition, you might not see
+ significant performance differences between file formats. At small data volumes, reduced I/O from an
+ efficient compressed file format can be counterbalanced by reduced opportunity for parallel execution. When
+ planning for a production deployment or conducting benchmarks, always use realistic data volumes to get a
+ true picture of performance and scalability.
+ </note>
+ </section>
+
+ <section id="perf_cookbook_small_files">
+
+ <title>Avoid data ingestion processes that produce many small files.</title>
+
+ <p>
+ When producing data files outside of Impala, prefer either text format or Avro, where you can build up the
+ files row by row. Once the data is in Impala, you can convert it to the more efficient Parquet format and
+ split into multiple data files using a single <codeph>INSERT ... SELECT</codeph> statement. Or, if you have
+ the infrastructure to produce multi-megabyte Parquet files as part of your data preparation process, do
+ that and skip the conversion step inside Impala.
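+ </p>
+
+ <p>
+ For example, a compaction pass from a text table into Parquet might look like the following.
+ (The table names are hypothetical.)
+ </p>
+<codeblock>CREATE TABLE sales_parquet LIKE sales_text STORED AS PARQUET;
+INSERT INTO sales_parquet SELECT * FROM sales_text;</codeblock>
+ <p>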
+ </p> + + <p> + Always use <codeph>INSERT ... SELECT</codeph> to copy significant volumes of data from table to table + within Impala. Avoid <codeph>INSERT ... VALUES</codeph> for any substantial volume of data or + performance-critical tables, because each such statement produces a separate tiny data file. See + <xref href="impala_insert.xml#insert"/> for examples of the <codeph>INSERT ... SELECT</codeph> syntax. + </p> + + <p> + For example, if you have thousands of partitions in a Parquet table, each with less than + <ph rev="parquet_block_size">256 MB</ph> of data, consider partitioning in a less granular way, such as by + year / month rather than year / month / day. If an inefficient data ingestion process produces thousands of + data files in the same table or partition, consider compacting the data by performing an <codeph>INSERT ... + SELECT</codeph> to copy all the data to a different table; the data will be reorganized into a smaller + number of larger files by this process. + </p> + </section> + + <section id="perf_cookbook_partitioning"> + + <title>Choose partitioning granularity based on actual data volume.</title> + + <p> + Partitioning is a technique that physically divides the data based on values of one or more columns, such + as by year, month, day, region, city, section of a web site, and so on. When you issue queries that request + a specific value or range of values for the partition key columns, Impala can avoid reading the irrelevant + data, potentially yielding a huge savings in disk I/O. + </p> + + <p> + When deciding which column(s) to use for partitioning, choose the right level of granularity. For example, + should you partition by year, month, and day, or only by year and month? Choose a partitioning strategy + that puts at least <ph rev="parquet_block_size">256 MB</ph> of data in each partition, to take advantage of + HDFS bulk I/O and Impala distributed queries. 
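+ </p>
+
+ <p>
+ For example, a table partitioned at year / month granularity (hypothetical table and column
+ names) might be declared as follows:
+ </p>
+<codeblock>CREATE TABLE web_logs (event_time TIMESTAMP, url STRING, ip_address STRING)
+  PARTITIONED BY (year SMALLINT, month TINYINT) STORED AS PARQUET;</codeblock>
+ <p>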
+ </p> + + <p> + Over-partitioning can also cause query planning to take longer than necessary, as Impala prunes the + unnecessary partitions. Ideally, keep the number of partitions in the table under 30 thousand. + </p> + + <p> + When preparing data files to go in a partition directory, create several large files rather than many small + ones. If you receive data in the form of many small files and have no control over the input format, + consider using the <codeph>INSERT ... SELECT</codeph> syntax to copy data from one table or partition to + another, which compacts the files into a relatively small number (based on the number of nodes in the + cluster). + </p> + + <p> + If you need to reduce the overall number of partitions and increase the amount of data in each partition, + first look for partition key columns that are rarely referenced or are referenced in non-critical queries + (not subject to an SLA). For example, your web site log data might be partitioned by year, month, day, and + hour, but if most queries roll up the results by day, perhaps you only need to partition by year, month, + and day. + </p> + + <p> + If you need to reduce the granularity even more, consider creating <q>buckets</q>, computed values + corresponding to different sets of partition key values. For example, you can use the + <codeph>TRUNC()</codeph> function with a <codeph>TIMESTAMP</codeph> column to group date and time values + based on intervals such as week or quarter. See + <xref href="impala_datetime_functions.xml#datetime_functions"/> for details. + </p> + + <p> + See <xref href="impala_partitioning.xml#partitioning"/> for full details and performance considerations for + partitioning. 
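+ </p>
+
+ <p>
+ For example, the following query (against a hypothetical table) buckets <codeph>TIMESTAMP</codeph>
+ values by quarter rather than relying on a fine-grained partition key:
+ </p>
+<codeblock>SELECT TRUNC(event_time, 'Q') AS quarter, COUNT(*)
+  FROM web_logs
+GROUP BY quarter;</codeblock>
+ <p>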
+ </p>
+ </section>
+
+ <section id="perf_cookbook_partition_keys">
+
+ <title>Use smallest appropriate integer types for partition key columns.</title>
+
+ <p>
+ Although it is tempting to use strings for partition key columns, since those values are turned into HDFS
+ directory names anyway, you can minimize memory usage by using numeric values for common partition key
+ fields such as <codeph>YEAR</codeph>, <codeph>MONTH</codeph>, and <codeph>DAY</codeph>. Use the smallest
+ integer type that holds the appropriate range of values, typically <codeph>TINYINT</codeph> for
+ <codeph>MONTH</codeph> and <codeph>DAY</codeph>, and <codeph>SMALLINT</codeph> for <codeph>YEAR</codeph>.
+ Use the <codeph>EXTRACT()</codeph> function to pull out individual date and time fields from a
+ <codeph>TIMESTAMP</codeph> value, and <codeph>CAST()</codeph> the return value to the appropriate integer
+ type.
+ </p>
+ </section>
+
+ <section id="perf_cookbook_parquet_block_size">
+
+ <title>Choose an appropriate Parquet block size.</title>
+
+ <p rev="parquet_block_size">
+ By default, the Impala <codeph>INSERT ... SELECT</codeph> statement creates Parquet files with a 256 MB
+ block size. (This default was changed in Impala 2.0. Formerly, the limit was 1 GB, but Impala made
+ conservative estimates about compression, resulting in files that were smaller than 1 GB.)
+ </p>
+
+ <p>
+ Each Parquet file written by Impala is a single block, allowing the whole file to be processed as a unit by a single host.
+ As you copy Parquet files into HDFS or between HDFS filesystems, use <codeph>hdfs dfs -pb</codeph> to preserve the original
+ block size.
+ </p>
+
+ <p>
+ If there are only one or a few data blocks in your Parquet table, or in a partition that is the only one
+ accessed by a query, then you might experience a slowdown for a different reason: not enough data to take
+ advantage of Impala's parallel distributed queries.
Each data block is processed by a single core on one of + the DataNodes. In a 100-node cluster of 16-core machines, you could potentially process thousands of data + files simultaneously. You want to find a sweet spot between <q>many tiny files</q> and <q>single giant + file</q> that balances bulk I/O and parallel processing. You can set the <codeph>PARQUET_FILE_SIZE</codeph> + query option before doing an <codeph>INSERT ... SELECT</codeph> statement to reduce the size of each + generated Parquet file. <ph rev="2.0.0">(Specify the file size as an absolute number of bytes, or in Impala + 2.0 and later, in units ending with <codeph>m</codeph> for megabytes or <codeph>g</codeph> for + gigabytes.)</ph> Run benchmarks with different file sizes to find the right balance point for your + particular data volume. + </p> + </section> + + <section id="perf_cookbook_stats"> + + <title>Gather statistics for all tables used in performance-critical or high-volume join queries.</title> + + <p> + Gather the statistics with the <codeph>COMPUTE STATS</codeph> statement. See + <xref href="impala_perf_joins.xml#perf_joins"/> for details. + </p> + </section> + + <section id="perf_cookbook_network"> + + <title>Minimize the overhead of transmitting results back to the client.</title> + + <p> + Use techniques such as: + </p> + + <ul> + <li> + Aggregation. If you need to know how many rows match a condition, the total values of matching values + from some column, the lowest or highest matching value, and so on, call aggregate functions such as + <codeph>COUNT()</codeph>, <codeph>SUM()</codeph>, and <codeph>MAX()</codeph> in the query rather than + sending the result set to an application and doing those computations there. Remember that the size of an + unaggregated result set could be huge, requiring substantial time to transmit across the network. + </li> + + <li> + Filtering. 
Use all applicable tests in the <codeph>WHERE</codeph> clause of a query to eliminate rows + that are not relevant, rather than producing a big result set and filtering it using application logic. + </li> + + <li> + <codeph>LIMIT</codeph> clause. If you only need to see a few sample values from a result set, or the top + or bottom values from a query using <codeph>ORDER BY</codeph>, include the <codeph>LIMIT</codeph> clause + to reduce the size of the result set rather than asking for the full result set and then throwing most of + the rows away. + </li> + + <li> + Avoid overhead from pretty-printing the result set and displaying it on the screen. When you retrieve the + results through <cmdname>impala-shell</cmdname>, use <cmdname>impala-shell</cmdname> options such as + <codeph>-B</codeph> and <codeph>--output_delimiter</codeph> to produce results without special + formatting, and redirect output to a file rather than printing to the screen. Consider using + <codeph>INSERT ... SELECT</codeph> to write the results directly to new files in HDFS. See + <xref href="impala_shell_options.xml#shell_options"/> for details about the + <cmdname>impala-shell</cmdname> command-line options. + </li> + </ul> + </section> + + <section id="perf_cookbook_explain"> + + <title>Verify that your queries are planned in an efficient logical manner.</title> + + <p> + Examine the <codeph>EXPLAIN</codeph> plan for a query before actually running it. See + <xref href="impala_explain.xml#explain"/> and <xref href="impala_explain_plan.xml#perf_explain"/> for + details. + </p> + </section> + + <section id="perf_cookbook_profile"> + + <title>Verify performance characteristics of queries.</title> + + <p> + Verify that the low-level aspects of I/O, memory usage, network bandwidth, CPU utilization, and so on are + within expected ranges by examining the query profile for a query after running it. See + <xref href="impala_explain_plan.xml#perf_profile"/> for details. 
+ </p> + </section> + + <section id="perf_cookbook_os"> + + <title>Use appropriate operating system settings.</title> + + <p> + See <xref href="http://www.cloudera.com/content/www/en-us/documentation/enterprise/latest/topics/cdh_admin_performance.html" scope="external" format="html">Optimizing Performance in CDH</xref> + for recommendations about operating system + settings that you can change to influence Impala performance. In particular, you might find + that changing the <codeph>vm.swappiness</codeph> Linux kernel setting to a non-zero value improves + overall performance. + </p> + </section> + + </conbody> +</concept> http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/1fcc8cee/docs/topics/impala_perf_resources.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_perf_resources.xml b/docs/topics/impala_perf_resources.xml new file mode 100644 index 0000000..e00c6de --- /dev/null +++ b/docs/topics/impala_perf_resources.xml @@ -0,0 +1,60 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd"> +<concept id="mem_limits"> + + <title>Controlling Impala Resource Usage</title> + <titlealts audience="PDF"><navtitle>Controlling Resource Usage</navtitle></titlealts> + <prolog> + <metadata> + <data name="Category" value="Impala"/> + <data name="Category" value="Performance"/> + <data name="Category" value="Memory"/> + <data name="Category" value="Scalability"/> + <data name="Category" value="Resource Management"/> + <data name="Category" value="Administrators"/> + <data name="Category" value="Developers"/> + <data name="Category" value="Data Analysts"/> + </metadata> + </prolog> + + <conbody> + + <p> + Sometimes, balancing raw query performance against scalability requires limiting the amount of resources, + such as memory or CPU, used by a single query or group of queries. 
Impala can use several mechanisms that + help to smooth out the load during heavy concurrent usage, resulting in faster overall query times and + sharing of resources across Impala queries, MapReduce jobs, and other kinds of workloads across a CDH + cluster: + </p> + + <ul> + <li> + The Impala admission control feature uses a fast, distributed mechanism to hold back queries that exceed + limits on the number of concurrent queries or the amount of memory used. The queries are queued, and + executed as other queries finish and resources become available. You can control the concurrency limits, + and specify different limits for different groups of users to divide cluster resources according to the + priorities of different classes of users. This feature is new in Impala 1.3, and works with both CDH 4 and + CDH 5. See <xref href="impala_admission.xml#admission_control"/> for details. + </li> + + <li> + <p> + You can restrict the amount of memory Impala reserves during query execution by specifying the + <codeph>-mem_limit</codeph> option for the <codeph>impalad</codeph> daemon. See + <xref href="impala_config_options.xml#config_options"/> for details. This limit applies only to the + memory that is directly consumed by queries; Impala reserves additional memory at startup, for example to + hold cached metadata. + </p> + </li> + + <li> + <p> + For production deployment, Cloudera recommends that you implement resource isolation using mechanisms + such as cgroups, which you can configure using Cloudera Manager. For details, see the + <xref href="http://www.cloudera.com/documentation/enterprise/latest/topics/cm_mc_service_pools.html" scope="external" format="html">Static + Resource Pools</xref> in the Cloudera Manager documentation. 
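+ </p>
+
+ <p>
+ In addition to these cluster-level mechanisms, you can experiment with a per-session cap
+ through the <codeph>MEM_LIMIT</codeph> query option in <cmdname>impala-shell</cmdname>.
+ (The value, specified here in bytes, and the query are purely illustrative.)
+ </p>
+<codeblock>[impala-host:21000] > set mem_limit=2000000000;
+[impala-host:21000] > select count(*) from big_table;</codeblock>
+ <p>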
+ </p> + </li> + </ul> + </conbody> +</concept> http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/1fcc8cee/docs/topics/impala_perf_skew.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_perf_skew.xml b/docs/topics/impala_perf_skew.xml new file mode 100644 index 0000000..b3a7cec --- /dev/null +++ b/docs/topics/impala_perf_skew.xml @@ -0,0 +1,150 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd"> +<concept id="perf_skew"> + + <title>Detecting and Correcting HDFS Block Skew Conditions</title> + <titlealts audience="PDF"><navtitle>HDFS Block Skew</navtitle></titlealts> + <prolog> + <metadata> + <data name="Category" value="Impala"/> + <data name="Category" value="Performance"/> + <data name="Category" value="HDFS"/> + <data name="Category" value="Proof of Concept"/> + <data name="Category" value="Administrators"/> + <data name="Category" value="Developers"/> + <data name="Category" value="Data Analysts"/> + </metadata> + </prolog> + + <conbody> + + <p> + For best performance of Impala parallel queries, the work is divided equally across hosts in the cluster, and + all hosts take approximately equal time to finish their work. If one host takes substantially longer than + others, the extra time needed for the slow host can become the dominant factor in query performance. + Therefore, one of the first steps in performance tuning for Impala is to detect and correct such conditions. + </p> + + <p> + The main cause of uneven performance that you can correct within Impala is <term>skew</term> in the number of + HDFS data blocks processed by each host, where some hosts process substantially more data blocks than others. + This condition can occur because of uneven distribution of the data values themselves, for example causing + certain data files or partitions to be large while others are very small. 
(Although it is possible to have + unevenly distributed data without any problems with the distribution of HDFS blocks.) Block skew could also + be due to the underlying block allocation policies within HDFS, the replication factor of the data files, and + the way that Impala chooses the host to process each data block. + </p> + + <p> + The most convenient way to detect block skew, or slow-host issues in general, is to examine the <q>executive + summary</q> information from the query profile after running a query: + </p> + + <ul> + <li> + <p> + In <cmdname>impala-shell</cmdname>, issue the <codeph>SUMMARY</codeph> command immediately after the + query is complete, to see just the summary information. If you detect issues involving skew, you might + switch to issuing the <codeph>PROFILE</codeph> command, which displays the summary information followed + by a detailed performance analysis. + </p> + </li> + + <li> + <p> + In the Cloudera Manager interface or the Impala debug web UI, click on the <uicontrol>Profile</uicontrol> + link associated with the query after it is complete. The executive summary information is displayed early + in the profile output. + </p> + </li> + </ul> + + <p> + For each phase of the query, you see an <uicontrol>Avg Time</uicontrol> and a <uicontrol>Max Time</uicontrol> + value, along with <uicontrol>#Hosts</uicontrol> indicating how many hosts are involved in that query phase. + For all the phases with <uicontrol>#Hosts</uicontrol> greater than one, look for cases where the maximum time + is substantially greater than the average time. Focus on the phases that took the longest, for example, those + taking multiple seconds rather than milliseconds or microseconds. + </p> + + <p> + If you detect that some hosts take longer than others, first rule out non-Impala causes. 
One reason that some
+ hosts could be slower than others is that those hosts have less capacity than the others, or that they are
+ substantially busier due to unevenly distributed non-Impala workloads:
+ </p>
+
+ <ul>
+ <li>
+ <p>
+ For clusters running Impala, keep the relative capacities of all hosts roughly equal. Any cost savings
+ from including some underpowered hosts in the cluster will likely be outweighed by poor or uneven
+ performance, and the time spent diagnosing performance issues.
+ </p>
+ </li>
+
+ <li>
+ <p>
+ If non-Impala workloads cause slowdowns on some hosts but not others, use the appropriate load-balancing
+ techniques for the non-Impala components to smooth out the load across the cluster.
+ </p>
+ </li>
+ </ul>
+
+ <p>
+ If the hosts on your cluster are evenly powered and evenly loaded, examine the detailed profile output to
+ determine which host is taking longer than others for the query phase in question. Examine how many bytes are
+ processed during that phase on that host, how much memory is used, and how many bytes are transmitted across
+ the network.
+ </p>
+
+ <p>
+ The most common symptom is a higher number of bytes read on one host than others, due to one host being
+ requested to process a higher number of HDFS data blocks. This condition is more likely to occur when the
+ number of blocks accessed by the query is relatively small. For example, if you have a 10-node cluster and
+ the query processes 10 HDFS blocks, each node might not process exactly one block. If one node sits idle
+ while another node processes two blocks, the query could take twice as long as if the data were perfectly
+ distributed.
+ </p>
+
+ <p>
+ Possible solutions in this case include:
+ </p>
+
+ <ul>
+ <li>
+ <p>
+ If the query is artificially small, perhaps for benchmarking purposes, scale it up to process a larger
+ data set.
For example, if some nodes read 10 HDFS data blocks while others read 11, the overall effect of
+ the uneven distribution is much lower than when some nodes do twice as much work as others. As a
+ guideline, aim for a <q>sweet spot</q> where each node reads 2 GB or more from HDFS per query. Queries
+ that process lower volumes than that could experience inconsistent performance that smooths out as
+ queries become more data-intensive.
+ </p>
+ </li>
+
+ <li>
+ <p>
+ If the query processes only a few large blocks, so that many nodes sit idle and cannot help to
+ parallelize the query, consider reducing the overall block size. For example, you might adjust the
+ <codeph>PARQUET_FILE_SIZE</codeph> query option before copying or converting data into a Parquet table.
+ Or you might adjust the granularity of data files produced earlier in the ETL pipeline by non-Impala
+ components. In Impala 2.0 and later, the default Parquet block size is 256 MB, reduced from 1 GB, to
+ improve parallelism for common cluster sizes and data volumes.
+ </p>
+ </li>
+
+ <li>
+ <p>
+ Reduce the amount of compression applied to the data. For text data files, the highest degree of
+ compression (gzip) produces unsplittable files that are more difficult for Impala to process in parallel,
+ and require extra memory during processing to hold the compressed and uncompressed data simultaneously.
+ For binary formats such as Parquet and Avro, compression can result in fewer data blocks overall, but
+ remember that when queries process relatively few blocks, there is less opportunity for parallel
+ execution and many nodes in the cluster might sit idle. Note that when Impala writes Parquet data with
+ the query option <codeph>COMPRESSION_CODEC=NONE</codeph> enabled, the data is still typically compact due
+ to the encoding schemes used by Parquet, independent of the final compression step.
+ </p> + </li> + </ul> + </conbody> +</concept> http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/1fcc8cee/docs/topics/impala_perf_testing.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_perf_testing.xml b/docs/topics/impala_perf_testing.xml new file mode 100644 index 0000000..d621556 --- /dev/null +++ b/docs/topics/impala_perf_testing.xml @@ -0,0 +1,175 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd"> +<concept id="performance_testing"> + + <title>Testing Impala Performance</title> + <prolog> + <metadata> + <data name="Category" value="Impala"/> + <data name="Category" value="Performance"/> + <data name="Category" value="Troubleshooting"/> + <data name="Category" value="Proof of Concept"/> + <data name="Category" value="Logs"/> + <data name="Category" value="Administrators"/> + <data name="Category" value="Developers"/> + <data name="Category" value="Data Analysts"/> + <!-- Should reorg this topic to use nested topics, not sections. Some keywords like 'logs' buried in section titles. --> + <data name="Category" value="Sectionated Pages"/> + </metadata> + </prolog> + + <conbody> + + <p> + Test to ensure that Impala is configured for optimal performance. If you have installed Impala without + Cloudera Manager, complete the processes described in this topic to help ensure a proper configuration. Even + if you installed Impala with Cloudera Manager, which automatically applies appropriate configurations, these + procedures can be used to verify that Impala is set up correctly. + </p> + + <section id="checking_config_performance"> + + <title>Checking Impala Configuration Values</title> + + <p> + You can inspect Impala configuration values by connecting to your Impala server using a browser. 
+ </p>
+
+ <p>
+ <b>To check Impala configuration values:</b>
+ </p>
+
+ <ol>
+ <li>
+ Use a browser to connect to one of the hosts running <codeph>impalad</codeph> in your environment.
+ Connect using an address of the form
+ <codeph>http://<varname>hostname</varname>:<varname>port</varname>/varz</codeph>.
+ <note>
+ In the preceding example, replace <codeph>hostname</codeph> and <codeph>port</codeph> with the name and
+ port of your Impala server. The default port is 25000.
+ </note>
+ </li>
+
+ <li>
+ Review the configured values.
+ <p>
+ For example, to check that your system is configured to use block locality tracking information, you
+ would check that the value for <codeph>dfs.datanode.hdfs-blocks-metadata.enabled</codeph> is
+ <codeph>true</codeph>.
+ </p>
+ </li>
+ </ol>
+
+ <p id="p_31">
+ <b>To check data locality:</b>
+ </p>
+
+ <ol>
+ <li>
+ Execute a query on a dataset that is available across multiple nodes. For example, for a table named
+ <codeph>MyTable</codeph> that has a reasonable chance of being spread across multiple DataNodes:
+<codeblock>[impalad-host:21000] > SELECT COUNT(*) FROM MyTable;</codeblock>
+ </li>
+
+ <li>
+ After the query completes, review the contents of the Impala logs. You should find a recent message
+ similar to the following:
+<codeblock>Total remote scan volume = 0</codeblock>
+ </li>
+ </ol>
+
+ <p>
+ The presence of remote scans may indicate <codeph>impalad</codeph> is not running on the correct nodes.
+ This can happen because some DataNodes are not running <codeph>impalad</codeph>, or because the
+ <codeph>impalad</codeph> instance that initiates the query is unable to contact one or more of the other
+ <codeph>impalad</codeph> instances.
+ </p>
+
+ <p>
+ <b>To understand the causes of this issue:</b>
+ </p>
+
+ <ol>
+ <li>
+ Connect to the debugging web server. By default, this server runs on port 25000. This page lists all
+ <codeph>impalad</codeph> instances running in your cluster.
If there are fewer instances than you expect,
+ this often indicates some DataNodes are not running <codeph>impalad</codeph>. Ensure
+ <codeph>impalad</codeph> is started on all DataNodes.
+ </li>
+
+ <li>
+ <!-- To do:
+ There are other references to this tip about the "Impala daemon's hostname" elsewhere. Could reconcile, conref, or link.
+ -->
+ If you are using multi-homed hosts, ensure that the Impala daemon's hostname resolves to the interface on
+ which <codeph>impalad</codeph> is running. The hostname Impala is using is displayed when
+ <codeph>impalad</codeph> starts. To explicitly set the hostname, use the <codeph>--hostname</codeph> flag.
+ </li>
+
+ <li>
+ Check that <codeph>statestored</codeph> is running as expected. Review the contents of the state store
+ log to ensure all instances of <codeph>impalad</codeph> are listed as having connected to the state
+ store.
+ </li>
+ </ol>
+ </section>
+
+ <section id="checking_config_logs">
+
+ <title>Reviewing Impala Logs</title>
+
+ <p>
+ You can review the contents of the Impala logs for signs that short-circuit reads or block location
+ tracking are not functioning. Before checking logs, execute a simple query against a small HDFS dataset.
+ Completing a query task generates log messages using current settings. Information on starting Impala and
+ executing queries can be found in <xref href="impala_processes.xml#processes"/> and
+ <xref href="impala_impala_shell.xml#impala_shell"/>. Information on logging can be found in
+ <xref href="impala_logging.xml#logging"/>. Log messages and their interpretations are as follows:
+ </p>
+
+ <table>
+ <tgroup cols="2">
+ <colspec colname="1" colwidth="30*"/>
+ <colspec colname="2" colwidth="10*"/>
+ <thead>
+ <row>
+ <entry>
+ Log Message
+ </entry>
+ <entry>
+ Interpretation
+ </entry>
+ </row>
+ </thead>
+ <tbody>
+ <row>
+ <entry>
+ <p>
+<pre>Unknown disk id. This will negatively affect performance.
Check your hdfs settings to enable block location metadata +</pre> + </p> + </entry> + <entry> + <p> + Tracking block locality is not enabled. + </p> + </entry> + </row> + <row> + <entry> + <p> +<pre>Unable to load native-hadoop library for your platform... using builtin-java classes where applicable</pre> + </p> + </entry> + <entry> + <p> + Native checksumming is not enabled. + </p> + </entry> + </row> + </tbody> + </tgroup> + </table> + </section> + </conbody> +</concept> http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/1fcc8cee/docs/topics/impala_planning.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_planning.xml b/docs/topics/impala_planning.xml new file mode 100644 index 0000000..f103ab8 --- /dev/null +++ b/docs/topics/impala_planning.xml @@ -0,0 +1,30 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd"> +<concept id="planning"> + + <title>Planning for Impala Deployment</title> + <titlealts audience="PDF"><navtitle>Deployment Planning</navtitle></titlealts> + <prolog> + <metadata> + <data name="Category" value="Impala"/> + <data name="Category" value="Deploying"/> + <data name="Category" value="Planning"/> + <data name="Category" value="Proof of Concept"/> + <data name="Category" value="Administrators"/> + <data name="Category" value="Developers"/> + <data name="Category" value="Stub Pages"/> + </metadata> + </prolog> + + <conbody> + + <p> + <indexterm audience="Cloudera">planning</indexterm> + Before you set up Impala in production, do some planning to make sure that your hardware setup has sufficient + capacity, that your cluster topology is optimal for Impala queries, and that your schema design and ETL + processes follow the best practices for Impala. + </p> + + <p outputclass="toc"/> + </conbody> +</concept>
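The log checks described in the impala_perf_testing.xml topic above lend themselves to a small script. Below is a minimal, illustrative sketch (not part of the Impala distribution) that scans a chunk of impalad log text for the messages quoted in that topic; the helper name `check_impala_log` and the sample log-line prefixes are hypothetical, while the message substrings themselves are taken verbatim from the documentation.

```python
# Illustrative helper (assumption: not an official Impala tool). It scans
# impalad log text for the performance-related messages quoted in the
# "Reviewing Impala Logs" and "To check data locality" procedures above.

DISK_ID_MSG = "Unknown disk id. This will negatively affect performance."
NATIVE_LIB_MSG = "Unable to load native-hadoop library for your platform"
REMOTE_SCAN_PREFIX = "Total remote scan volume = "

def check_impala_log(log_text):
    """Summarize performance-related warnings found in impalad log output."""
    findings = {
        # Present when HDFS block location metadata tracking is not enabled.
        "block_locality_disabled": DISK_ID_MSG in log_text,
        # Present when native checksumming is not enabled.
        "native_checksumming_disabled": NATIVE_LIB_MSG in log_text,
        # Most recent "Total remote scan volume" value, or None if absent.
        "remote_scan_volume": None,
    }
    for line in log_text.splitlines():
        if REMOTE_SCAN_PREFIX in line:
            findings["remote_scan_volume"] = (
                line.split(REMOTE_SCAN_PREFIX, 1)[1].split()[0]
            )
    return findings

# Synthetic sample log lines (the prefixes are made up for illustration).
sample = (
    "I0101 12:00:00 12345 hdfs-scan-node.cc] Unknown disk id. "
    "This will negatively affect performance. Check your hdfs settings "
    "to enable block location metadata\n"
    "I0101 12:00:01 12345 coordinator.cc] Total remote scan volume = 0\n"
)
result = check_impala_log(sample)
print(result["block_locality_disabled"])  # -> True
print(result["remote_scan_volume"])       # -> 0 ("0" means all scans were local)
```

A nonzero remote scan volume, or either warning being present, would prompt the corresponding fix described in the topic (enabling block location metadata, or installing the native Hadoop library).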
