[42/44] incubator-joshua-site git commit: First attempt

mjpost Fri, 08 Apr 2016 20:11:23 -0700

http://git-wip-us.apache.org/repos/asf/incubator-joshua-site/blob/53cc3005/4.0/features.html
----------------------------------------------------------------------
diff --git a/4.0/features.html b/4.0/features.html
new file mode 100644
index 0000000..49dcecc
--- /dev/null
+++ b/4.0/features.html
@@ -0,0 +1,257 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" 
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
+
+<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
+  <head>
+    <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
+    <link rel="stylesheet" type="text/css" media="screen,print" 
href="../joshua4.css" />
+    <title>Joshua | Features</title>
+  </head>
+
+  <body>
+
+    <div id="navbar">
+      <a href="http://joshua-decoder.org/">
+        <img src="../images/joshua-logo-small.png" width="130px" 
+             alt="Joshua logo (picture of a Joshua tree)" />
+      </a>
+
+      <p class="infobox">
+        <b>Stable version</b><br />
+        4.1<br/><br/>
+        <b>Release date</b><br />
+        2013 January
+      </p>
+
+<!--       <div class="infobox"> -->
+<!--         <b>AUTO LINKS</b><br/> -->
+<!--         <ul> -->
+<!--            -->
+<!--           <li> Advanced features</li> -->
+<!--            -->
+<!--           <li> Advanced features</li> -->
+<!--            -->
+<!--           <li> Advanced features</li> -->
+<!--            -->
+<!--           <li> Building a language pack</li> -->
+<!--            -->
+<!--           <li> Building a language pack</li> -->
+<!--            -->
+<!--           <li> Bundling a configuration</li> -->
+<!--            -->
+<!--           <li> Contributors</li> -->
+<!--            -->
+<!--           <li> Decoder configuration parameters</li> -->
+<!--            -->
+<!--           <li> Decoder configuration parameters</li> -->
+<!--            -->
+<!--           <li> Decoder configuration parameters</li> -->
+<!--            -->
+<!--           <li> Decoder configuration parameters</li> -->
+<!--            -->
+<!--           <li> Frequently Asked Questions</li> -->
+<!--            -->
+<!--           <li> Common problems</li> -->
+<!--            -->
+<!--           <li> Frequently Asked Questions</li> -->
+<!--            -->
+<!--           <li> Common problems</li> -->
+<!--            -->
+<!--           <li> Features</li> -->
+<!--            -->
+<!--           <li> Features</li> -->
+<!--            -->
+<!--           <li> Features</li> -->
+<!--            -->
+<!--           <li> Features</li> -->
+<!--            -->
+<!--           <li> Joshua file formats</li> -->
+<!--            -->
+<!--           <li> Joshua file formats</li> -->
+<!--            -->
+<!--           <li> Joshua file formats</li> -->
+<!--            -->
+<!--           <li> Joshua file formats</li> -->
+<!--            -->
+<!--           <li> </li> -->
+<!--            -->
+<!--           <li> </li> -->
+<!--            -->
+<!--           <li> </li> -->
+<!--            -->
+<!--           <li> Fisher and CALLHOME Spanish English Speech Translation 
Corpus</li> -->
+<!--            -->
+<!--           <li> Indian Languages Parallel Corpora</li> -->
+<!--            -->
+<!--           <li> Joshua 4.0 User Documentation</li> -->
+<!--            -->
+<!--           <li> Language packs</li> -->
+<!--            -->
+<!--           <li> Paraphrase Packs</li> -->
+<!--            -->
+<!--           <li> Joshua releases</li> -->
+<!--            -->
+<!--           <li> Support</li> -->
+<!--            -->
+<!--           <li> Getting Started</li> -->
+<!--            -->
+<!--           <li> Welcome to Joshua</li> -->
+<!--            -->
+<!--           <li> Joshua documentation</li> -->
+<!--            -->
+<!--           <li> Joshua documentation</li> -->
+<!--            -->
+<!--           <li> Installation</li> -->
+<!--            -->
+<!--           <li> Installation</li> -->
+<!--            -->
+<!--           <li> Alignment with Jacana</li> -->
+<!--            -->
+<!--           <li> Alignment with Jacana</li> -->
+<!--            -->
+<!--           <li> Alignment with Jacana</li> -->
+<!--            -->
+<!--           <li> Building large LMs with SRILM</li> -->
+<!--            -->
+<!--           <li> Building large LMs with SRILM</li> -->
+<!--            -->
+<!--           <li> Building large LMs with SRILM</li> -->
+<!--            -->
+<!--           <li> Building large LMs with SRILM</li> -->
+<!--            -->
+<!--           <li> Lattice decoding</li> -->
+<!--            -->
+<!--           <li> Grammar Packing</li> -->
+<!--            -->
+<!--           <li> Grammar Packing</li> -->
+<!--            -->
+<!--           <li> Grammar Packing</li> -->
+<!--            -->
+<!--           <li> Grammar Packing</li> -->
+<!--            -->
+<!--           <li> The Joshua Pipeline</li> -->
+<!--            -->
+<!--           <li> The Joshua Pipeline</li> -->
+<!--            -->
+<!--           <li> The Joshua Pipeline</li> -->
+<!--            -->
+<!--           <li> The Joshua Pipeline</li> -->
+<!--            -->
+<!--           <li> Quick Start</li> -->
+<!--            -->
+<!--           <li> Quick Start</li> -->
+<!--            -->
+<!--           <li> Releases</li> -->
+<!--            -->
+<!--           <li> Server mode</li> -->
+<!--            -->
+<!--           <li> Server mode</li> -->
+<!--            -->
+<!--           <li> Server mode</li> -->
+<!--            -->
+<!--           <li> Installing and running the Joshua Decoder</li> -->
+<!--            -->
+<!--           <li> Grammar extraction with Thrax</li> -->
+<!--            -->
+<!--           <li> Grammar extraction with Thrax</li> -->
+<!--            -->
+<!--           <li> Grammar extraction with Thrax</li> -->
+<!--            -->
+<!--           <li> Grammar extraction with Thrax</li> -->
+<!--            -->
+<!--           <li> Building Translation Models</li> -->
+<!--            -->
+<!--           <li> Building Translation Models</li> -->
+<!--            -->
+<!--           <li> Building Translation Models</li> -->
+<!--            -->
+<!--           <li> Building Translation Models</li> -->
+<!--            -->
+<!--           <li> Pipeline tutorial</li> -->
+<!--            -->
+<!--           <li> Pipeline tutorial</li> -->
+<!--            -->
+<!--           <li> Pipeline tutorial</li> -->
+<!--            -->
+<!--           <li> What's New</li> -->
+<!--            -->
+<!--           <li> What's New</li> -->
+<!--            -->
+<!--           <li> Z-MERT</li> -->
+<!--            -->
+<!--           <li> Z-MERT</li> -->
+<!--            -->
+<!--           <li> Z-MERT</li> -->
+<!--            -->
+<!--           <li> Z-MERT</li> -->
+<!--            -->
+<!--           <li> </li> -->
+<!--            -->
+<!--           <li> </li> -->
+<!--            -->
+<!--           <li> </li> -->
+<!--            -->
+<!--         </ul> -->
+<!--       </div>   -->
+
+      <div class="infobox">
+
+        <b>Links</b><br />
+        <ul>
+          <li> <a href="../index.html">Main</a> </li>
+          <li> <a href="pipeline.html">Pipeline</a> </li>
+          <li> <a href="step-by-step-instructions.html">Manual walkthrough</a> 
</li>
+          <li> <a href="decoder.html">Decoder</a> </li>
+          <li> <a href="server.html">Decoder Server</a> </li>
+          <li> <a href="file-formats.html">File formats</a> </li>
+          <li> <a href="thrax.html">Grammar Extraction</a> </li>
+          <li> <a href="../releases.html">Releases</a> </li>
+        </ul>
+      </div>
+
+      <div class="infobox">
+        <b>Advanced</b><br />
+        <ul>
+<!--          <li> <a href="packing.html">Grammar packing</a> </li> -->
+          <li> <a href="large-lms.html">Building large LMs</a> </li>
+          <li> <a href="zmert.html">Running Z-MERT</a> </li>
+          <li> <a href="lattice.html">Lattices</a> </li>
+          <li> <a href="server.html">TCP/IP server</a> </li>
+          <li> <a href="bundle.html">Bundled configuration</a> </li>
+        </ul>
+      </div>
+
+      <div class="infobox">
+        <b>Help</b><br />
+        <ul>
+          <li> <a href="faq.html">Answers</a> </li>
+          <li> <a 
href="https://groups.google.com/d/forum/joshua_support">Archive</a> </li>
+        </ul>
+      </div>
+
+      <div class="footer">
+        Last updated on April 08, 2016
+      </div>
+
+    </div>
+
+    <div id="main">
+      <div id="title">
+        <h1>Features</h1>
+      </div>
+
+      <div id="content">
+        
+        <p>This file will contain information about the Joshua decoder 
features.</p>
+
+
+      </div>
+    </div>
+
+  </body>
+</html>
+
+
+
+
+


http://git-wip-us.apache.org/repos/asf/incubator-joshua-site/blob/53cc3005/4.0/features.md
----------------------------------------------------------------------
diff --git a/4.0/features.md b/4.0/features.md
deleted file mode 100644
index d915c82..0000000
--- a/4.0/features.md
+++ /dev/null
@@ -1,7 +0,0 @@
----
-layout: default4
-category: links
-title: Features
----
-
-This file will contain information about the Joshua decoder features.

http://git-wip-us.apache.org/repos/asf/incubator-joshua-site/blob/53cc3005/4.0/file-formats.html
----------------------------------------------------------------------
diff --git a/4.0/file-formats.html b/4.0/file-formats.html
new file mode 100644
index 0000000..73770c6
--- /dev/null
+++ b/4.0/file-formats.html
@@ -0,0 +1,341 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" 
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
+
+<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
+  <head>
+    <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
+    <link rel="stylesheet" type="text/css" media="screen,print" 
href="../joshua4.css" />
+    <title>Joshua | Joshua file formats</title>
+  </head>
+
+  <body>
+
+    <div id="navbar">
+      <a href="http://joshua-decoder.org/">
+        <img src="../images/joshua-logo-small.png" width="130px" 
+             alt="Joshua logo (picture of a Joshua tree)" />
+      </a>
+
+      <p class="infobox">
+        <b>Stable version</b><br />
+        4.1<br/><br/>
+        <b>Release date</b><br />
+        2013 January
+      </p>
+
+<!--       <div class="infobox"> -->
+<!--         <b>AUTO LINKS</b><br/> -->
+<!--         <ul> -->
+<!--            -->
+<!--           <li> Advanced features</li> -->
+<!--            -->
+<!--           <li> Advanced features</li> -->
+<!--            -->
+<!--           <li> Advanced features</li> -->
+<!--            -->
+<!--           <li> Building a language pack</li> -->
+<!--            -->
+<!--           <li> Building a language pack</li> -->
+<!--            -->
+<!--           <li> Bundling a configuration</li> -->
+<!--            -->
+<!--           <li> Contributors</li> -->
+<!--            -->
+<!--           <li> Decoder configuration parameters</li> -->
+<!--            -->
+<!--           <li> Decoder configuration parameters</li> -->
+<!--            -->
+<!--           <li> Decoder configuration parameters</li> -->
+<!--            -->
+<!--           <li> Decoder configuration parameters</li> -->
+<!--            -->
+<!--           <li> Frequently Asked Questions</li> -->
+<!--            -->
+<!--           <li> Common problems</li> -->
+<!--            -->
+<!--           <li> Frequently Asked Questions</li> -->
+<!--            -->
+<!--           <li> Common problems</li> -->
+<!--            -->
+<!--           <li> Features</li> -->
+<!--            -->
+<!--           <li> Features</li> -->
+<!--            -->
+<!--           <li> Features</li> -->
+<!--            -->
+<!--           <li> Features</li> -->
+<!--            -->
+<!--           <li> Joshua file formats</li> -->
+<!--            -->
+<!--           <li> Joshua file formats</li> -->
+<!--            -->
+<!--           <li> Joshua file formats</li> -->
+<!--            -->
+<!--           <li> Joshua file formats</li> -->
+<!--            -->
+<!--           <li> </li> -->
+<!--            -->
+<!--           <li> </li> -->
+<!--            -->
+<!--           <li> </li> -->
+<!--            -->
+<!--           <li> Fisher and CALLHOME Spanish English Speech Translation 
Corpus</li> -->
+<!--            -->
+<!--           <li> Indian Languages Parallel Corpora</li> -->
+<!--            -->
+<!--           <li> Joshua 4.0 User Documentation</li> -->
+<!--            -->
+<!--           <li> Language packs</li> -->
+<!--            -->
+<!--           <li> Paraphrase Packs</li> -->
+<!--            -->
+<!--           <li> Joshua releases</li> -->
+<!--            -->
+<!--           <li> Support</li> -->
+<!--            -->
+<!--           <li> Getting Started</li> -->
+<!--            -->
+<!--           <li> Welcome to Joshua</li> -->
+<!--            -->
+<!--           <li> Joshua documentation</li> -->
+<!--            -->
+<!--           <li> Joshua documentation</li> -->
+<!--            -->
+<!--           <li> Installation</li> -->
+<!--            -->
+<!--           <li> Installation</li> -->
+<!--            -->
+<!--           <li> Alignment with Jacana</li> -->
+<!--            -->
+<!--           <li> Alignment with Jacana</li> -->
+<!--            -->
+<!--           <li> Alignment with Jacana</li> -->
+<!--            -->
+<!--           <li> Building large LMs with SRILM</li> -->
+<!--            -->
+<!--           <li> Building large LMs with SRILM</li> -->
+<!--            -->
+<!--           <li> Building large LMs with SRILM</li> -->
+<!--            -->
+<!--           <li> Building large LMs with SRILM</li> -->
+<!--            -->
+<!--           <li> Lattice decoding</li> -->
+<!--            -->
+<!--           <li> Grammar Packing</li> -->
+<!--            -->
+<!--           <li> Grammar Packing</li> -->
+<!--            -->
+<!--           <li> Grammar Packing</li> -->
+<!--            -->
+<!--           <li> Grammar Packing</li> -->
+<!--            -->
+<!--           <li> The Joshua Pipeline</li> -->
+<!--            -->
+<!--           <li> The Joshua Pipeline</li> -->
+<!--            -->
+<!--           <li> The Joshua Pipeline</li> -->
+<!--            -->
+<!--           <li> The Joshua Pipeline</li> -->
+<!--            -->
+<!--           <li> Quick Start</li> -->
+<!--            -->
+<!--           <li> Quick Start</li> -->
+<!--            -->
+<!--           <li> Releases</li> -->
+<!--            -->
+<!--           <li> Server mode</li> -->
+<!--            -->
+<!--           <li> Server mode</li> -->
+<!--            -->
+<!--           <li> Server mode</li> -->
+<!--            -->
+<!--           <li> Installing and running the Joshua Decoder</li> -->
+<!--            -->
+<!--           <li> Grammar extraction with Thrax</li> -->
+<!--            -->
+<!--           <li> Grammar extraction with Thrax</li> -->
+<!--            -->
+<!--           <li> Grammar extraction with Thrax</li> -->
+<!--            -->
+<!--           <li> Grammar extraction with Thrax</li> -->
+<!--            -->
+<!--           <li> Building Translation Models</li> -->
+<!--            -->
+<!--           <li> Building Translation Models</li> -->
+<!--            -->
+<!--           <li> Building Translation Models</li> -->
+<!--            -->
+<!--           <li> Building Translation Models</li> -->
+<!--            -->
+<!--           <li> Pipeline tutorial</li> -->
+<!--            -->
+<!--           <li> Pipeline tutorial</li> -->
+<!--            -->
+<!--           <li> Pipeline tutorial</li> -->
+<!--            -->
+<!--           <li> What's New</li> -->
+<!--            -->
+<!--           <li> What's New</li> -->
+<!--            -->
+<!--           <li> Z-MERT</li> -->
+<!--            -->
+<!--           <li> Z-MERT</li> -->
+<!--            -->
+<!--           <li> Z-MERT</li> -->
+<!--            -->
+<!--           <li> Z-MERT</li> -->
+<!--            -->
+<!--           <li> </li> -->
+<!--            -->
+<!--           <li> </li> -->
+<!--            -->
+<!--           <li> </li> -->
+<!--            -->
+<!--         </ul> -->
+<!--       </div>   -->
+
+      <div class="infobox">
+
+        <b>Links</b><br />
+        <ul>
+          <li> <a href="../index.html">Main</a> </li>
+          <li> <a href="pipeline.html">Pipeline</a> </li>
+          <li> <a href="step-by-step-instructions.html">Manual walkthrough</a> 
</li>
+          <li> <a href="decoder.html">Decoder</a> </li>
+          <li> <a href="server.html">Decoder Server</a> </li>
+          <li> <a href="file-formats.html">File formats</a> </li>
+          <li> <a href="thrax.html">Grammar Extraction</a> </li>
+          <li> <a href="../releases.html">Releases</a> </li>
+        </ul>
+      </div>
+
+      <div class="infobox">
+        <b>Advanced</b><br />
+        <ul>
+<!--          <li> <a href="packing.html">Grammar packing</a> </li> -->
+          <li> <a href="large-lms.html">Building large LMs</a> </li>
+          <li> <a href="zmert.html">Running Z-MERT</a> </li>
+          <li> <a href="lattice.html">Lattices</a> </li>
+          <li> <a href="server.html">TCP/IP server</a> </li>
+          <li> <a href="bundle.html">Bundled configuration</a> </li>
+        </ul>
+      </div>
+
+      <div class="infobox">
+        <b>Help</b><br />
+        <ul>
+          <li> <a href="faq.html">Answers</a> </li>
+          <li> <a 
href="https://groups.google.com/d/forum/joshua_support">Archive</a> </li>
+        </ul>
+      </div>
+
+      <div class="footer">
+        Last updated on April 08, 2016
+      </div>
+
+    </div>
+
+    <div id="main">
+      <div id="title">
+        <h1>Joshua file formats</h1>
+      </div>
+
+      <div id="content">
+        
+        <p>This page describes the formats of Joshua configuration and support 
files.</p>
+
+<h2 id="translation-models-grammars">Translation models (grammars)</h2>
+
+<p>Joshua supports three grammar file formats.</p>
+
+<ol>
+  <li>Thrax / Hiero</li>
+  <li>SAMT [deprecated]</li>
+  <li>packed</li>
+</ol>
+
+<p>The <em>Hiero</em> format is not restricted to Hiero grammars, but simply 
means <em>the format that David
+Chiang developed for Hiero</em>.  It can support a much broader class of SCFGs 
containing an arbitrary
+set of nonterminals.  Similarly, the <em>SAMT</em> format is not restricted to 
SAMT grammars but instead
+simply denotes <em>the grammar format that Zollmann and Venugopal developed 
for their decoder</em>.  To
+remove this source of confusion, “thrax” is the preferred format 
designation, and is in fact the
+default.</p>
+
+<p>The packed grammar format is the efficient grammar representation developed 
by
+<a href="http://cs.jhu.edu/~juri">Juri Ganitkevich</a> and <a 
href="packing.html">is described in detail elsewhere</a>.</p>
+
+<p>Grammar rules in the Thrax format follow this format:</p>
+
+<div class="highlighter-rouge"><pre class="highlight"><code>[LHS] ||| 
SOURCE-SIDE ||| TARGET-SIDE ||| FEATURES
+</code></pre>
+</div>
+
+<p>Here are two examples, one for a Hiero grammar, and the other for an 
SAMT grammar:</p>
+
+<div class="highlighter-rouge"><pre class="highlight"><code>[X] ||| el chico 
[X] ||| the boy [X] ||| -3.14 0 2 17
+[S] ||| el chico [VP] ||| the boy [VP] ||| -3.14 0 2 17
+</code></pre>
+</div>
+
+<p>The feature values can have optional labels, e.g.:</p>
+
+<div class="highlighter-rouge"><pre class="highlight"><code>[X] ||| el chico 
[X] ||| the boy [X] ||| lexprob=-3.14 abstract=0 numwords=2 count=17
+</code></pre>
+</div>
+
+<p>These feature names are made up.  For an actual list of feature names, 
please
+<a href="thrax.html">see the Thrax documentation</a>.</p>
+
+<p>The SAMT grammar format is deprecated and undocumented.</p>
+
+<h2 id="language-model">Language Model</h2>
+
+<p>Joshua has three language model implementations: <a href="https://kheafield.com/code/kenlm/">KenLM</a>, <a 
href="https://github.com/adampauls/berkeleylm">BerkeleyLM</a>, and an (unrecommended)
+dummy Java implementation.  All language model implementations support the 
standard ARPA format
+output by <a href="http://www.speech.sri.com/projects/srilm/">SRILM</a>.  In addition, KenLM and BerkeleyLM support 
compiled formats that can be loaded
+more quickly and efficiently.</p>
+
+<h3 id="compiling-for-kenlm">Compiling for KenLM</h3>
+
+<p>To compile an ARPA grammar for KenLM, use the (provided) <code 
class="highlighter-rouge">build_binary</code> command, located deep within
+the Joshua source code:</p>
+
+<div class="highlighter-rouge"><pre 
class="highlight"><code>$JOSHUA/src/joshua/decoder/ff/lm/kenlm/build_binary 
lm.arpa lm.kenlm
+</code></pre>
+</div>
+
+<p>This script takes the <code class="highlighter-rouge">lm.arpa</code> file 
and produces the compiled version in <code 
class="highlighter-rouge">lm.kenlm</code>.</p>
+
+<h3 id="compiling-for-berkeleylm">Compiling for BerkeleyLM</h3>
+
+<p>To compile a grammar for BerkeleyLM, type:</p>
+
+<div class="highlighter-rouge"><pre class="highlight"><code>java -cp 
$JOSHUA/lib/berkeleylm.jar -server -mxMEM 
edu.berkeley.nlp.lm.io.MakeLmBinaryFromArpa lm.arpa lm.berkeleylm
+</code></pre>
+</div>
+
+<p>The <code class="highlighter-rouge">lm.berkeleylm</code> file can then be 
listed directly in the <a href="decoder.html">Joshua configuration file</a>.</p>
+
+<h2 id="joshua-configuration">Joshua configuration</h2>
+
+<p>See <a href="decoder.html">the decoder page</a>.</p>
+
+<h2 id="pipeline-configuration">Pipeline configuration</h2>
+
+<p>See <a href="pipeline.html">the pipeline page</a>.</p>
+
+<h2 id="thrax-configuration">Thrax configuration</h2>
+
+<p>See <a href="thrax.html">the thrax page</a>.</p>
+
+
+      </div>
+    </div>
+
+  </body>
+</html>
+
+
+
+
+

http://git-wip-us.apache.org/repos/asf/incubator-joshua-site/blob/53cc3005/4.0/file-formats.md
----------------------------------------------------------------------
diff --git a/4.0/file-formats.md b/4.0/file-formats.md
deleted file mode 100644
index c10f906..0000000
--- a/4.0/file-formats.md
+++ /dev/null
@@ -1,78 +0,0 @@
----
-layout: default4
-category: advanced
-title: Joshua file formats
----
-This page describes the formats of Joshua configuration and support files.
-
-## Translation models (grammars)
-
-Joshua supports three grammar file formats.
-
-1. Thrax / Hiero
-1. SAMT [deprecated]
-1. packed
-
-The *Hiero* format is not restricted to Hiero grammars, but simply means *the 
format that David
-Chiang developed for Hiero*.  It can support a much broader class of SCFGs 
containing an arbitrary
-set of nonterminals.  Similarly, the *SAMT* format is not restricted to SAMT 
grammars but instead
-simply denotes *the grammar format that Zollmann and Venugopal developed for 
their decoder*.  To
-remove this source of confusion, "thrax" is the preferred format designation, 
and is in fact the
-default.
-
-The packed grammar format is the efficient grammar representation developed by
-[Juri Ganitkevich](http://cs.jhu.edu/~juri) [is described in detail 
elsewhere](packing.html).
-
-Grammar rules in the Thrax format follow this format:
-
-    [LHS] ||| SOURCE-SIDE ||| TARGET-SIDE ||| FEATURES
-    
-Here are some two examples, one for a Hiero grammar, and the other for an SAMT 
grammar:
-
-    [X] ||| el chico [X] ||| the boy [X] ||| -3.14 0 2 17
-    [S] ||| el chico [VP] ||| the boy [VP] ||| -3.14 0 2 17
-    
-The feature values can have optional labels, e.g.:
-
-    [X] ||| el chico [X] ||| the boy [X] ||| lexprob=-3.14 abstract=0 
numwords=2 count=17
-    
-These feature names are made up.  For an actual list of feature names, please
-[see the Thrax documentation](thrax.html).
-
-The SAMT grammar format is deprecated and undocumented.
-
-## Language Model
-
-Joshua has three language model implementations: [KenLM](), [BerkeleyLM](), 
and an (unrecommended)
-dummy Java implementation.  All language model implementations support the 
standard ARPA format
-output by [SRILM]().  In addition, KenLM and BerkeleyLM support compiled 
formats that can be loaded
-more quickly and efficiently.
-
-### Compiling for KenLM
-
-To compile an ARPA grammar for KenLM, use the (provided) `build-binary` 
command, located deep within
-the Joshua source code:
-
-    $JOSHUA/src/joshua/decoder/ff/lm/kenlm/build_binary lm.arpa lm.kenlm
-    
-This script takes the `lm.arpa` file and produces the compiled version in 
`lm.kenlm`.
-
-### Compiling for BerkeleyLM
-
-To compile a grammar for BerkeleyLM, type:
-
-    java -cp $JOSHUA/lib/berkeleylm.jar -server -mxMEM 
edu.berkeley.nlp.lm.io.MakeLmBinaryFromArpa lm.arpa lm.berkeleylm
-
-The `lm.berkeleylm` file can then be listed directly in the [Joshua 
configuration file](decoder.html).
-
-## Joshua configuration
-
-See [the decoder page](decoder.html).
-
-## Pipeline configuration
-
-See [the pipeline page](pipeline.html).
-
-## Thrax configuration
-
-See [the thrax page](thrax.html).

http://git-wip-us.apache.org/repos/asf/incubator-joshua-site/blob/53cc3005/4.0/index.html
----------------------------------------------------------------------
diff --git a/4.0/index.html b/4.0/index.html
new file mode 100644
index 0000000..216b006
--- /dev/null
+++ b/4.0/index.html
@@ -0,0 +1,309 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" 
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
+
+<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
+  <head>
+    <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
+    <link rel="stylesheet" type="text/css" media="screen,print" 
href="../joshua4.css" />
+    <title>Joshua | Joshua 4.0 User Documentation</title>
+  </head>
+
+  <body>
+
+    <div id="navbar">
+      <a href="http://joshua-decoder.org/">
+        <img src="../images/joshua-logo-small.png" width="130px" 
+             alt="Joshua logo (picture of a Joshua tree)" />
+      </a>
+
+      <p class="infobox">
+        <b>Stable version</b><br />
+        4.1<br/><br/>
+        <b>Release date</b><br />
+        2013 January
+      </p>
+
+<!--       <div class="infobox"> -->
+<!--         <b>AUTO LINKS</b><br/> -->
+<!--         <ul> -->
+<!--            -->
+<!--           <li> Advanced features</li> -->
+<!--            -->
+<!--           <li> Advanced features</li> -->
+<!--            -->
+<!--           <li> Advanced features</li> -->
+<!--            -->
+<!--           <li> Building a language pack</li> -->
+<!--            -->
+<!--           <li> Building a language pack</li> -->
+<!--            -->
+<!--           <li> Bundling a configuration</li> -->
+<!--            -->
+<!--           <li> Contributors</li> -->
+<!--            -->
+<!--           <li> Decoder configuration parameters</li> -->
+<!--            -->
+<!--           <li> Decoder configuration parameters</li> -->
+<!--            -->
+<!--           <li> Decoder configuration parameters</li> -->
+<!--            -->
+<!--           <li> Decoder configuration parameters</li> -->
+<!--            -->
+<!--           <li> Frequently Asked Questions</li> -->
+<!--            -->
+<!--           <li> Common problems</li> -->
+<!--            -->
+<!--           <li> Frequently Asked Questions</li> -->
+<!--            -->
+<!--           <li> Common problems</li> -->
+<!--            -->
+<!--           <li> Features</li> -->
+<!--            -->
+<!--           <li> Features</li> -->
+<!--            -->
+<!--           <li> Features</li> -->
+<!--            -->
+<!--           <li> Features</li> -->
+<!--            -->
+<!--           <li> Joshua file formats</li> -->
+<!--            -->
+<!--           <li> Joshua file formats</li> -->
+<!--            -->
+<!--           <li> Joshua file formats</li> -->
+<!--            -->
+<!--           <li> Joshua file formats</li> -->
+<!--            -->
+<!--           <li> </li> -->
+<!--            -->
+<!--           <li> </li> -->
+<!--            -->
+<!--           <li> </li> -->
+<!--            -->
+<!--           <li> Fisher and CALLHOME Spanish English Speech Translation 
Corpus</li> -->
+<!--            -->
+<!--           <li> Indian Languages Parallel Corpora</li> -->
+<!--            -->
+<!--           <li> Joshua 4.0 User Documentation</li> -->
+<!--            -->
+<!--           <li> Language packs</li> -->
+<!--            -->
+<!--           <li> Paraphrase Packs</li> -->
+<!--            -->
+<!--           <li> Joshua releases</li> -->
+<!--            -->
+<!--           <li> Support</li> -->
+<!--            -->
+<!--           <li> Getting Started</li> -->
+<!--            -->
+<!--           <li> Welcome to Joshua</li> -->
+<!--            -->
+<!--           <li> Joshua documentation</li> -->
+<!--            -->
+<!--           <li> Joshua documentation</li> -->
+<!--            -->
+<!--           <li> Installation</li> -->
+<!--            -->
+<!--           <li> Installation</li> -->
+<!--            -->
+<!--           <li> Alignment with Jacana</li> -->
+<!--            -->
+<!--           <li> Alignment with Jacana</li> -->
+<!--            -->
+<!--           <li> Alignment with Jacana</li> -->
+<!--            -->
+<!--           <li> Building large LMs with SRILM</li> -->
+<!--            -->
+<!--           <li> Building large LMs with SRILM</li> -->
+<!--            -->
+<!--           <li> Building large LMs with SRILM</li> -->
+<!--            -->
+<!--           <li> Building large LMs with SRILM</li> -->
+<!--            -->
+<!--           <li> Lattice decoding</li> -->
+<!--            -->
+<!--           <li> Grammar Packing</li> -->
+<!--            -->
+<!--           <li> Grammar Packing</li> -->
+<!--            -->
+<!--           <li> Grammar Packing</li> -->
+<!--            -->
+<!--           <li> Grammar Packing</li> -->
+<!--            -->
+<!--           <li> The Joshua Pipeline</li> -->
+<!--            -->
+<!--           <li> The Joshua Pipeline</li> -->
+<!--            -->
+<!--           <li> The Joshua Pipeline</li> -->
+<!--            -->
+<!--           <li> The Joshua Pipeline</li> -->
+<!--            -->
+<!--           <li> Quick Start</li> -->
+<!--            -->
+<!--           <li> Quick Start</li> -->
+<!--            -->
+<!--           <li> Releases</li> -->
+<!--            -->
+<!--           <li> Server mode</li> -->
+<!--            -->
+<!--           <li> Server mode</li> -->
+<!--            -->
+<!--           <li> Server mode</li> -->
+<!--            -->
+<!--           <li> Installing and running the Joshua Decoder</li> -->
+<!--            -->
+<!--           <li> Grammar extraction with Thrax</li> -->
+<!--            -->
+<!--           <li> Grammar extraction with Thrax</li> -->
+<!--            -->
+<!--           <li> Grammar extraction with Thrax</li> -->
+<!--            -->
+<!--           <li> Grammar extraction with Thrax</li> -->
+<!--            -->
+<!--           <li> Building Translation Models</li> -->
+<!--            -->
+<!--           <li> Building Translation Models</li> -->
+<!--            -->
+<!--           <li> Building Translation Models</li> -->
+<!--            -->
+<!--           <li> Building Translation Models</li> -->
+<!--            -->
+<!--           <li> Pipeline tutorial</li> -->
+<!--            -->
+<!--           <li> Pipeline tutorial</li> -->
+<!--            -->
+<!--           <li> Pipeline tutorial</li> -->
+<!--            -->
+<!--           <li> What's New</li> -->
+<!--            -->
+<!--           <li> What's New</li> -->
+<!--            -->
+<!--           <li> Z-MERT</li> -->
+<!--            -->
+<!--           <li> Z-MERT</li> -->
+<!--            -->
+<!--           <li> Z-MERT</li> -->
+<!--            -->
+<!--           <li> Z-MERT</li> -->
+<!--            -->
+<!--           <li> </li> -->
+<!--            -->
+<!--           <li> </li> -->
+<!--            -->
+<!--           <li> </li> -->
+<!--            -->
+<!--         </ul> -->
+<!--       </div>   -->
+
+      <div class="infobox">
+
+        <b>Links</b><br />
+        <ul>
+          <li> <a href="../index.html">Main</a> </li>
+          <li> <a href="pipeline.html">Pipeline</a> </li>
+          <li> <a href="step-by-step-instructions.html">Manual walkthrough</a> 
</li>
+          <li> <a href="decoder.html">Decoder</a> </li>
+          <li> <a href="server.html">Decoder Server</a> </li>
+          <li> <a href="file-formats.html">File formats</a> </li>
+          <li> <a href="thrax.html">Grammar Extraction</a> </li>
+          <li> <a href="../releases.html">Releases</a> </li>
+        </ul>
+      </div>
+
+      <div class="infobox">
+        <b>Advanced</b><br />
+        <ul>
+<!--          <li> <a href="packing.html">Grammar packing</a> </li> -->
+          <li> <a href="large-lms.html">Building large LMs</a> </li>
+          <li> <a href="zmert.html">Running Z-MERT</a> </li>
+          <li> <a href="lattice.html">Lattices</a> </li>
+          <li> <a href="server.html">TCP/IP server</a> </li>
+          <li> <a href="bundle.html">Bundled configuration</a> </li>
+        </ul>
+      </div>
+
+      <div class="infobox">
+        <b>Help</b><br />
+        <ul>
+          <li> <a href="faq.html">Answers</a> </li>
+          <li> <a 
href="https://groups.google.com/d/forum/joshua_support">Archive</a> </li>
+        </ul>
+      </div>
+
+      <div class="footer">
+        Last updated on April 08, 2016
+      </div>
+
+    </div>
+
+    <div id="main">
+      <div id="title">
+        <h1>Joshua 4.0 User Documentation</h1>
+      </div>
+
+      <div id="content">
+        
+        <p>This page contains end-user oriented documentation for the 4.0 
release of
+<a href="http://joshua-decoder.org/">the Joshua decoder</a>.</p>
+
+<h2 id="download-and-setup">Download and Setup</h2>
+
+<ol>
+  <li>
+    <p>Follow <a href="http://cs.jhu.edu/~post/files/joshua-4.0.tgz">this 
link</a> to download Joshua, or do it
+from the command line:</p>
+
+    <div class="highlighter-rouge"><pre class="highlight"><code>wget -q 
http://cs.jhu.edu/~post/files/joshua-4.0.tgz
+</code></pre>
+    </div>
+  </li>
+  <li>
+    <p>Next, unpack it, set the <code class="highlighter-rouge">$JOSHUA</code> 
environment variable, and compile everything:</p>
+
+    <div class="highlighter-rouge"><pre class="highlight"><code>tar xzf 
joshua-4.0.tgz
+cd joshua-4.0
+
+# for bash
+export JOSHUA=$(pwd)
+echo "export JOSHUA=$JOSHUA" &gt;&gt; ~/.bashrc
+
+# for tcsh
+setenv JOSHUA `pwd`
+echo "setenv JOSHUA $JOSHUA" &gt;&gt; ~/.profile
+   
+ant all
+</code></pre>
+    </div>
+  </li>
+  <li>
+    <p>That’s it.</p>
+  </li>
+</ol>
+
+<h2 id="quick-start">Quick start</h2>
+
+<p>If you just want to run the complete machine translation pipeline 
(beginning with data preparation,
+through alignment, hierarchical model building, tuning, testing, and 
reporting), we recommend you
+use our <a href="pipeline.html">pipeline script</a>.  You might also be 
interested in
+<a href="http://cs.jhu.edu/~ccb/joshua/">Chris’ old walkthrough</a>.</p>
+
+<h2 id="more-information">More information</h2>
+
+<p>For more detail on the decoder itself, including its command-line options, 
see
+<a href="decoder.html">the Joshua decoder page</a>.  You can also learn more 
about other steps of
+<a href="pipeline.html">the Joshua MT pipeline</a>, including <a 
href="thrax.html">grammar extraction</a> with Thrax and
+Joshua’s <a href="packing.html">efficient grammar representation</a> (new 
with version 4.0).</p>
+
+<p>If you have problems or issues, you might find some help <a 
href="faq.html">on our answers page</a> or
+<a href="https://groups.google.com/forum/?fromgroups#!forum/joshua_support">in 
the mailing list archives</a>.</p>
+
+
+      </div>
+    </div>
+
+  </body>
+</html>
+
+
+
+
+

http://git-wip-us.apache.org/repos/asf/incubator-joshua-site/blob/53cc3005/4.0/index.md
----------------------------------------------------------------------
diff --git a/4.0/index.md b/4.0/index.md
deleted file mode 100644
index ae62e4e..0000000
--- a/4.0/index.md
+++ /dev/null
@@ -1,48 +0,0 @@
----
-layout: default4
-title: Joshua 4.0 User Documentation
----
-
-This page contains end-user oriented documentation for the 4.0 release of
-[the Joshua decoder](http://joshua-decoder.org/).
-
-## Download and Setup
-
-1. Follow [this link](http://cs.jhu.edu/~post/files/joshua-4.0.tgz) to 
download Joshua, or do it
-from the command line:
-
-       wget -q http://cs.jhu.edu/~post/files/joshua-4.0.tgz
-
-2. Next, unpack it, set the `$JOSHUA` environment variable, and compile 
everything:
-
-       tar xzf joshua-4.0.tgz
-       cd joshua-4.0
-
-       # for bash
-       export JOSHUA=$(pwd)
-       echo "export JOSHUA=$JOSHUA" >> ~/.bashrc
-
-       # for tcsh
-       setenv JOSHUA `pwd`
-       echo "setenv JOSHUA $JOSHUA" >> ~/.profile
-       
-       ant all
-
-3. That's it.
-
-## Quick start
-
-If you just want to run the complete machine translation pipeline (beginning 
with data preparation,
-through alignment, hierarchical model building, tuning, testing, and 
reporting), we recommend you
-use our <a href="pipeline.html">pipeline script</a>.  You might also be 
interested in
-[Chris' old walkthrough](http://cs.jhu.edu/~ccb/joshua/).
-
-## More information
-
-For more detail on the decoder itself, including its command-line options, see
-[the Joshua decoder page](decoder.html).  You can also learn more about other 
steps of
-[the Joshua MT pipeline](pipeline.html), including [grammar 
extraction](thrax.html) with Thrax and
-Joshua's [efficient grammar representation](packing.html) (new with version 
4.0).
-
-If you have problems or issues, you might find some help [on our answers 
page](faq.html) or
-[in the mailing list 
archives](https://groups.google.com/forum/?fromgroups#!forum/joshua_support).

http://git-wip-us.apache.org/repos/asf/incubator-joshua-site/blob/53cc3005/4.0/large-lms.html
----------------------------------------------------------------------
diff --git a/4.0/large-lms.html b/4.0/large-lms.html
new file mode 100644
index 0000000..d591057
--- /dev/null
+++ b/4.0/large-lms.html
@@ -0,0 +1,455 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" 
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
+
+<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
+  <head>
+    <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
+    <link rel="stylesheet" type="text/css" media="screen,print" 
href="../joshua4.css" />
+    <title>Joshua | Building large LMs with SRILM</title>
+  </head>
+
+  <body>
+
+    <div id="navbar">
+      <a href="http://joshua-decoder.org/">
+        <img src="../images/joshua-logo-small.png" width="130px" 
+             alt="Joshua logo (picture of a Joshua tree)" />
+      </a>
+
+      <p class="infobox">
+        <b>Stable version</b><br />
+        4.1<br/><br/>
+        <b>Release date</b><br />
+        2013 January
+      </p>
+
+<!--       <div class="infobox"> -->
+<!--         <b>AUTO LINKS</b><br/> -->
+<!--         <ul> -->
+<!--            -->
+<!--           <li> Advanced features</li> -->
+<!--            -->
+<!--           <li> Advanced features</li> -->
+<!--            -->
+<!--           <li> Advanced features</li> -->
+<!--            -->
+<!--           <li> Building a language pack</li> -->
+<!--            -->
+<!--           <li> Building a language pack</li> -->
+<!--            -->
+<!--           <li> Bundling a configuration</li> -->
+<!--            -->
+<!--           <li> Contributors</li> -->
+<!--            -->
+<!--           <li> Decoder configuration parameters</li> -->
+<!--            -->
+<!--           <li> Decoder configuration parameters</li> -->
+<!--            -->
+<!--           <li> Decoder configuration parameters</li> -->
+<!--            -->
+<!--           <li> Decoder configuration parameters</li> -->
+<!--            -->
+<!--           <li> Frequently Asked Questions</li> -->
+<!--            -->
+<!--           <li> Common problems</li> -->
+<!--            -->
+<!--           <li> Frequently Asked Questions</li> -->
+<!--            -->
+<!--           <li> Common problems</li> -->
+<!--            -->
+<!--           <li> Features</li> -->
+<!--            -->
+<!--           <li> Features</li> -->
+<!--            -->
+<!--           <li> Features</li> -->
+<!--            -->
+<!--           <li> Features</li> -->
+<!--            -->
+<!--           <li> Joshua file formats</li> -->
+<!--            -->
+<!--           <li> Joshua file formats</li> -->
+<!--            -->
+<!--           <li> Joshua file formats</li> -->
+<!--            -->
+<!--           <li> Joshua file formats</li> -->
+<!--            -->
+<!--           <li> </li> -->
+<!--            -->
+<!--           <li> </li> -->
+<!--            -->
+<!--           <li> </li> -->
+<!--            -->
+<!--           <li> Fisher and CALLHOME Spanish English Speech Translation 
Corpus</li> -->
+<!--            -->
+<!--           <li> Indian Languages Parallel Corpora</li> -->
+<!--            -->
+<!--           <li> Joshua 4.0 User Documentation</li> -->
+<!--            -->
+<!--           <li> Language packs</li> -->
+<!--            -->
+<!--           <li> Paraphrase Packs</li> -->
+<!--            -->
+<!--           <li> Joshua releases</li> -->
+<!--            -->
+<!--           <li> Support</li> -->
+<!--            -->
+<!--           <li> Getting Started</li> -->
+<!--            -->
+<!--           <li> Welcome to Joshua</li> -->
+<!--            -->
+<!--           <li> Joshua documentation</li> -->
+<!--            -->
+<!--           <li> Joshua documentation</li> -->
+<!--            -->
+<!--           <li> Installation</li> -->
+<!--            -->
+<!--           <li> Installation</li> -->
+<!--            -->
+<!--           <li> Alignment with Jacana</li> -->
+<!--            -->
+<!--           <li> Alignment with Jacana</li> -->
+<!--            -->
+<!--           <li> Alignment with Jacana</li> -->
+<!--            -->
+<!--           <li> Building large LMs with SRILM</li> -->
+<!--            -->
+<!--           <li> Building large LMs with SRILM</li> -->
+<!--            -->
+<!--           <li> Building large LMs with SRILM</li> -->
+<!--            -->
+<!--           <li> Building large LMs with SRILM</li> -->
+<!--            -->
+<!--           <li> Lattice decoding</li> -->
+<!--            -->
+<!--           <li> Grammar Packing</li> -->
+<!--            -->
+<!--           <li> Grammar Packing</li> -->
+<!--            -->
+<!--           <li> Grammar Packing</li> -->
+<!--            -->
+<!--           <li> Grammar Packing</li> -->
+<!--            -->
+<!--           <li> The Joshua Pipeline</li> -->
+<!--            -->
+<!--           <li> The Joshua Pipeline</li> -->
+<!--            -->
+<!--           <li> The Joshua Pipeline</li> -->
+<!--            -->
+<!--           <li> The Joshua Pipeline</li> -->
+<!--            -->
+<!--           <li> Quick Start</li> -->
+<!--            -->
+<!--           <li> Quick Start</li> -->
+<!--            -->
+<!--           <li> Releases</li> -->
+<!--            -->
+<!--           <li> Server mode</li> -->
+<!--            -->
+<!--           <li> Server mode</li> -->
+<!--            -->
+<!--           <li> Server mode</li> -->
+<!--            -->
+<!--           <li> Installing and running the Joshua Decoder</li> -->
+<!--            -->
+<!--           <li> Grammar extraction with Thrax</li> -->
+<!--            -->
+<!--           <li> Grammar extraction with Thrax</li> -->
+<!--            -->
+<!--           <li> Grammar extraction with Thrax</li> -->
+<!--            -->
+<!--           <li> Grammar extraction with Thrax</li> -->
+<!--            -->
+<!--           <li> Building Translation Models</li> -->
+<!--            -->
+<!--           <li> Building Translation Models</li> -->
+<!--            -->
+<!--           <li> Building Translation Models</li> -->
+<!--            -->
+<!--           <li> Building Translation Models</li> -->
+<!--            -->
+<!--           <li> Pipeline tutorial</li> -->
+<!--            -->
+<!--           <li> Pipeline tutorial</li> -->
+<!--            -->
+<!--           <li> Pipeline tutorial</li> -->
+<!--            -->
+<!--           <li> What's New</li> -->
+<!--            -->
+<!--           <li> What's New</li> -->
+<!--            -->
+<!--           <li> Z-MERT</li> -->
+<!--            -->
+<!--           <li> Z-MERT</li> -->
+<!--            -->
+<!--           <li> Z-MERT</li> -->
+<!--            -->
+<!--           <li> Z-MERT</li> -->
+<!--            -->
+<!--           <li> </li> -->
+<!--            -->
+<!--           <li> </li> -->
+<!--            -->
+<!--           <li> </li> -->
+<!--            -->
+<!--         </ul> -->
+<!--       </div>   -->
+
+      <div class="infobox">
+
+        <b>Links</b><br />
+        <ul>
+          <li> <a href="../index.html">Main</a> </li>
+          <li> <a href="pipeline.html">Pipeline</a> </li>
+          <li> <a href="step-by-step-instructions.html">Manual walkthrough</a> 
</li>
+          <li> <a href="decoder.html">Decoder</a> </li>
+          <li> <a href="server.html">Decoder Server</a> </li>
+          <li> <a href="file-formats.html">File formats</a> </li>
+          <li> <a href="thrax.html">Grammar Extraction</a> </li>
+          <li> <a href="../releases.html">Releases</a> </li>
+        </ul>
+      </div>
+
+      <div class="infobox">
+        <b>Advanced</b><br />
+        <ul>
+<!--          <li> <a href="packing.html">Grammar packing</a> </li> -->
+          <li> <a href="large-lms.html">Building large LMs</a> </li>
+          <li> <a href="zmert.html">Running Z-MERT</a> </li>
+          <li> <a href="lattice.html">Lattices</a> </li>
+          <li> <a href="server.html">TCP/IP server</a> </li>
+          <li> <a href="bundle.html">Bundled configuration</a> </li>
+        </ul>
+      </div>
+
+      <div class="infobox">
+        <b>Help</b><br />
+        <ul>
+          <li> <a href="faq.html">Answers</a> </li>
+          <li> <a 
href="https://groups.google.com/d/forum/joshua_support">Archive</a> </li>
+        </ul>
+      </div>
+
+      <div class="footer">
+        Last updated on April 08, 2016
+      </div>
+
+    </div>
+
+    <div id="main">
+      <div id="title">
+        <h1>Building large LMs with SRILM</h1>
+      </div>
+
+      <div id="content">
+        
+        <p>The following is a tutorial for building a large language model 
from the
+English Gigaword Fifth Edition corpus
+<a 
href="http://www.ldc.upenn.edu/Catalog/catalogEntry.jsp?catalogId=LDC2011T07">LDC2011T07</a>
+using SRILM. English text is provided from seven different sources.</p>
+
+<h3 id="step-0-clean-up-the-corpus">Step 0: Clean up the corpus</h3>
+
+<p>The Gigaword corpus has to be stripped of all SGML tags and tokenized.
+Instructions for performing those steps are not included in this
+documentation. A description of this process can be found in a paper
+called <a 
href="https://akbcwekex2012.files.wordpress.com/2012/05/28_paper.pdf">“Annotated
+Gigaword”</a>.</p>
+
+<p>The Joshua package ships with a script that converts all alphabetical
+characters to their lowercase equivalent. The script is located at
+<code class="highlighter-rouge">$JOSHUA/scripts/lowercase.perl</code>.</p>
+
+<p>Make a directory structure as follows:</p>
+
+<div class="highlighter-rouge"><pre class="highlight"><code>gigaword/
+├── corpus/
+│   ├── afp_eng/
+│   │   ├── afp_eng_199405.lc.gz
+│   │   ├── afp_eng_199406.lc.gz
+│   │   ├── ...
+│   │   └── counts/
+│   ├── apw_eng/
+│   │   ├── apw_eng_199411.lc.gz
+│   │   ├── apw_eng_199412.lc.gz
+│   │   ├── ...
+│   │   └── counts/
+│   ├── cna_eng/
+│   │   ├── ...
+│   │   └── counts/
+│   ├── ltw_eng/
+│   │   ├── ...
+│   │   └── counts/
+│   ├── nyt_eng/
+│   │   ├── ...
+│   │   └── counts/
+│   ├── wpb_eng/
+│   │   ├── ...
+│   │   └── counts/
+│   └── xin_eng/
+│       ├── ...
+│       └── counts/
+└── lm/
+    ├── afp_eng/
+    ├── apw_eng/
+    ├── cna_eng/
+    ├── ltw_eng/
+    ├── nyt_eng/
+    ├── wpb_eng/
+    └── xin_eng/
+</code></pre>
+</div>
+
+<p>The next step will be to build smaller LMs and then interpolate them into 
one
+file.</p>
+
+<h3 id="step-1-count-ngrams">Step 1: Count ngrams</h3>
+
+<p>Run the following script once from each source directory under the <code 
class="highlighter-rouge">corpus/</code>
+directory (edit it to specify the path to the <code 
class="highlighter-rouge">ngram-count</code> binary as well as
+the number of processors):</p>
+
+<div class="highlighter-rouge"><pre class="highlight"><code><span 
class="c">#!/bin/sh</span>
+
+<span class="nv">NGRAM_COUNT</span><span class="o">=</span><span 
class="nv">$SRILM_SRC</span>/bin/i686-m64/ngram-count
+<span class="nv">args</span><span class="o">=</span><span class="s2">""</span>
+
+<span class="k">for </span><span class="nb">source </span><span 
class="k">in</span> <span class="k">*</span>.gz; <span class="k">do
+   </span><span class="nv">args</span><span class="o">=</span><span 
class="nv">$args</span><span class="s2">"-sort -order 5 -text </span><span 
class="nv">$source</span><span class="s2"> -write counts/</span><span 
class="nv">$source</span><span class="s2">-counts.gz "</span>
+<span class="k">done
+
+</span><span class="nb">echo</span> <span class="nv">$args</span> | xargs 
--max-procs<span class="o">=</span>4 -n 7 <span class="nv">$NGRAM_COUNT</span>
+</code></pre>
+</div>
+
+<p>Then move each <code class="highlighter-rouge">counts/</code> directory to 
the corresponding directory under
+<code class="highlighter-rouge">lm/</code>. Now that each ngram has been 
counted, we can make a language
+model for each of the seven sources.</p>
+
+<h3 id="step-2-make-individual-language-models">Step 2: Make individual 
language models</h3>
+
+<p>SRILM includes a script, called <code 
class="highlighter-rouge">make-big-lm</code>, for building large language
+models under resource-limited environments. The manual for this script can be
+read online
+<a 
href="http://www-speech.sri.com/projects/srilm/manpages/training-scripts.1.html">here</a>.
+Since the Gigaword corpus is so large, it is convenient to use <code 
class="highlighter-rouge">make-big-lm</code>
+even in environments with many parallel processors and a lot of memory.</p>
+
+<p>Initiate the following script from each of the source directories under the
+<code class="highlighter-rouge">lm/</code> directory (edit it to specify the 
path to the <code class="highlighter-rouge">make-big-lm</code> script as
+well as the pruning threshold):</p>
+
+<div class="highlighter-rouge"><pre class="highlight"><code><span 
class="c">#!/bin/bash</span>
+<span class="nb">set</span> -x
+
+<span class="nv">CMD</span><span class="o">=</span><span 
class="nv">$SRILM_SRC</span>/bin/make-big-lm
+<span class="nv">PRUNE_THRESHOLD</span><span class="o">=</span>1e-8
+
+<span class="nv">$CMD</span> <span class="se">\</span>
+  -name gigalm <span class="sb">`</span><span class="k">for </span>k <span 
class="k">in </span>counts/<span class="k">*</span>.gz; <span class="k">do 
</span><span class="nb">echo</span> <span class="s2">" </span><span 
class="se">\</span><span class="s2">
+  -read </span><span class="nv">$k</span><span class="s2"> "</span>; <span 
class="k">done</span><span class="sb">`</span> <span class="se">\</span>
+  -lm lm.gz <span class="se">\</span>
+  -max-per-file 100000000 <span class="se">\</span>
+  -order 5 <span class="se">\</span>
+  -kndiscount <span class="se">\</span>
+  -interpolate <span class="se">\</span>
+  -unk <span class="se">\</span>
+  -prune <span class="nv">$PRUNE_THRESHOLD</span>
+</code></pre>
+</div>
+
+<p>The language model attributes chosen are the following:</p>
+
+<ul>
+  <li>N-grams up to order 5</li>
+  <li>Kneser-Ney smoothing</li>
+  <li>N-gram probability estimates at the specified order <em>n</em> are 
interpolated with
+lower-order estimates</li>
+  <li>include the unknown-word token as a regular word</li>
+  <li>pruning N-grams based on the specified threshold</li>
+</ul>
+
+<p>Next, we will mix the models together into a single file.</p>
+
+<h3 id="step-3-mix-models-together">Step 3: Mix models together</h3>
+
+<p>Using development text, interpolation weights can be determined that give 
highest
+weight to the source language models that have the lowest perplexity on the
+specified development set.</p>
+
+<h4 id="step-3-1-determine-interpolation-weights">Step 3-1: Determine 
interpolation weights</h4>
+
+<p>Initiate the following script from the <code 
class="highlighter-rouge">lm/</code> directory (edit it to specify the
+path to the <code class="highlighter-rouge">ngram</code> binary as well as the 
path to the development text file):</p>
+
+<div class="highlighter-rouge"><pre class="highlight"><code><span 
class="c">#!/bin/bash</span>
+<span class="nb">set</span> -x
+
+<span class="nv">NGRAM</span><span class="o">=</span><span 
class="nv">$SRILM_SRC</span>/bin/i686-m64/ngram
+<span class="nv">DEV_TEXT</span><span 
class="o">=</span>~mpost/expts/wmt12/runs/es-en/data/tune/tune.tok.lc.es
+
+<span class="nb">dirs</span><span class="o">=(</span> afp_eng apw_eng cna_eng 
ltw_eng nyt_eng wpb_eng xin_eng <span class="o">)</span>
+
+<span class="k">for </span>d <span class="k">in</span> <span 
class="k">${</span><span class="nv">dirs</span><span class="p">[@]</span><span 
class="k">}</span> ; <span class="k">do</span>
+  <span class="nv">$NGRAM</span> -debug 2 -order 5 -unk -lm <span 
class="nv">$d</span>/lm.gz -ppl <span class="nv">$DEV_TEXT</span> &gt; <span 
class="nv">$d</span>/lm.ppl ;
+<span class="k">done
+
+</span>compute-best-mix <span class="k">*</span>/lm.ppl &gt; best-mix.ppl
+</code></pre>
+</div>
+
+<p>Take a look at the contents of <code 
class="highlighter-rouge">best-mix.ppl</code>. It will contain a sequence of
+values in parentheses. These are the interpolation weights of the source
+language models in the order specified. Copy and paste the values within the
+parentheses into the script below.</p>
+
+<h4 id="step-3-2-combine-the-models">Step 3-2: Combine the models</h4>
+
+<p>Initiate the following script from the <code 
class="highlighter-rouge">lm/</code> directory (edit it to specify the
+path to the <code class="highlighter-rouge">ngram</code> binary as well as the 
interpolation weights):</p>
+
+<div class="highlighter-rouge"><pre class="highlight"><code><span 
class="c">#!/bin/bash</span>
+<span class="nb">set</span> -x
+
+<span class="nv">NGRAM</span><span class="o">=</span><span 
class="nv">$SRILM_SRC</span>/bin/i686-m64/ngram
+<span class="nv">DIRS</span><span class="o">=(</span>   afp_eng    apw_eng     
cna_eng  ltw_eng   nyt_eng  wpb_eng  xin_eng <span class="o">)</span>
+<span class="nv">LAMBDAS</span><span class="o">=(</span>0.00631272 0.000647602 
0.251555 0.0134726 0.348953 0.371566 0.00749238<span class="o">)</span>
+
+<span class="nv">$NGRAM</span> -order 5 -unk <span class="se">\</span>
+  -lm      <span class="k">${</span><span class="nv">DIRS</span><span 
class="p">[0]</span><span class="k">}</span>/lm.gz     -lambda  <span 
class="k">${</span><span class="nv">LAMBDAS</span><span 
class="p">[0]</span><span class="k">}</span> <span class="se">\</span>
+  -mix-lm  <span class="k">${</span><span class="nv">DIRS</span><span 
class="p">[1]</span><span class="k">}</span>/lm.gz <span class="se">\</span>
+  -mix-lm2 <span class="k">${</span><span class="nv">DIRS</span><span 
class="p">[2]</span><span class="k">}</span>/lm.gz -mix-lambda2 <span 
class="k">${</span><span class="nv">LAMBDAS</span><span 
class="p">[2]</span><span class="k">}</span> <span class="se">\</span>
+  -mix-lm3 <span class="k">${</span><span class="nv">DIRS</span><span 
class="p">[3]</span><span class="k">}</span>/lm.gz -mix-lambda3 <span 
class="k">${</span><span class="nv">LAMBDAS</span><span 
class="p">[3]</span><span class="k">}</span> <span class="se">\</span>
+  -mix-lm4 <span class="k">${</span><span class="nv">DIRS</span><span 
class="p">[4]</span><span class="k">}</span>/lm.gz -mix-lambda4 <span 
class="k">${</span><span class="nv">LAMBDAS</span><span 
class="p">[4]</span><span class="k">}</span> <span class="se">\</span>
+  -mix-lm5 <span class="k">${</span><span class="nv">DIRS</span><span 
class="p">[5]</span><span class="k">}</span>/lm.gz -mix-lambda5 <span 
class="k">${</span><span class="nv">LAMBDAS</span><span 
class="p">[5]</span><span class="k">}</span> <span class="se">\</span>
+  -mix-lm6 <span class="k">${</span><span class="nv">DIRS</span><span 
class="p">[6]</span><span class="k">}</span>/lm.gz -mix-lambda6 <span 
class="k">${</span><span class="nv">LAMBDAS</span><span 
class="p">[6]</span><span class="k">}</span> <span class="se">\</span>
+  -write-lm mixed_lm.gz
+</code></pre>
+</div>
+
+<p>The resulting file, <code class="highlighter-rouge">mixed_lm.gz</code>, is a 
language model based on all the text in
+the Gigaword corpus and with some probabilities biased to the development text
+specified in step 3-1. It is in the ARPA format. The optional next step converts
+it into KenLM format.</p>
+
+<h4 id="step-3-3-convert-to-kenlm">Step 3-3: Convert to KenLM</h4>
+
+<p>The KenLM format has some speed advantages over the ARPA format. Issuing the
+following command will write a new language model file <code 
class="highlighter-rouge">mixed_lm.kenlm</code> that
+is the <code class="highlighter-rouge">mixed_lm.gz</code> language model 
transformed into the KenLM format.</p>
+
+<div class="highlighter-rouge"><pre 
class="highlight"><code>$JOSHUA/src/joshua/decoder/ff/lm/kenlm/build_binary 
mixed_lm.gz mixed_lm.kenlm
+</code></pre>
+</div>
+
+
+
+      </div>
+    </div>
+
+  </body>
+</html>
+
+
+
+
+

http://git-wip-us.apache.org/repos/asf/incubator-joshua-site/blob/53cc3005/4.0/large-lms.md
----------------------------------------------------------------------
diff --git a/4.0/large-lms.md b/4.0/large-lms.md
deleted file mode 100644
index a4ba5b7..0000000
--- a/4.0/large-lms.md
+++ /dev/null
@@ -1,192 +0,0 @@
----
-layout: default4
-title: Building large LMs with SRILM
-category: advanced
----
-
-The following is a tutorial for building a large language model from the
-English Gigaword Fifth Edition corpus
-[LDC2011T07](http://www.ldc.upenn.edu/Catalog/catalogEntry.jsp?catalogId=LDC2011T07)
-using SRILM. English text is provided from seven different sources.
-
-### Step 0: Clean up the corpus
-
-The Gigaword corpus has to be stripped of all SGML tags and tokenized.
-Instructions for performing those steps are not included in this
-documentation. A description of this process can be found in a paper
-called ["Annotated
-Gigaword"](https://akbcwekex2012.files.wordpress.com/2012/05/28_paper.pdf).
-
-The Joshua package ships with a script that converts all alphabetical
-characters to their lowercase equivalent. The script is located at
-`$JOSHUA/scripts/lowercase.perl`.
-
-Make a directory structure as follows:
-
-    gigaword/
-    ├── corpus/
-    │   ├── afp_eng/
-    │   │   ├── afp_eng_199405.lc.gz
-    │   │   ├── afp_eng_199406.lc.gz
-    │   │   ├── ...
-    │   │   └── counts/
-    │   ├── apw_eng/
-    │   │   ├── apw_eng_199411.lc.gz
-    │   │   ├── apw_eng_199412.lc.gz
-    │   │   ├── ...
-    │   │   └── counts/
-    │   ├── cna_eng/
-    │   │   ├── ...
-    │   │   └── counts/
-    │   ├── ltw_eng/
-    │   │   ├── ...
-    │   │   └── counts/
-    │   ├── nyt_eng/
-    │   │   ├── ...
-    │   │   └── counts/
-    │   ├── wpb_eng/
-    │   │   ├── ...
-    │   │   └── counts/
-    │   └── xin_eng/
-    │       ├── ...
-    │       └── counts/
-    └── lm/
-        ├── afp_eng/
-        ├── apw_eng/
-        ├── cna_eng/
-        ├── ltw_eng/
-        ├── nyt_eng/
-        ├── wpb_eng/
-        └── xin_eng/
-
-
-The next step will be to build smaller LMs and then interpolate them into one
-file.
-
-### Step 1: Count ngrams
-
-Run the following script once from each source directory under the `corpus/`
-directory (edit it to specify the path to the `ngram-count` binary as well as
-the number of processors):
-
-    #!/bin/sh
-
-    NGRAM_COUNT=$SRILM_SRC/bin/i686-m64/ngram-count
-    args=""
-
-    for source in *.gz; do
-       args=$args"-sort -order 5 -text $source -write counts/$source-counts.gz 
"
-    done
-
-    echo $args | xargs --max-procs=4 -n 7 $NGRAM_COUNT
-
-Then move each `counts/` directory to the corresponding directory under
-`lm/`. Now that each ngram has been counted, we can make a language
-model for each of the seven sources.
-
-### Step 2: Make individual language models
-
-SRILM includes a script, called `make-big-lm`, for building large language
-models under resource-limited environments. The manual for this script can be
-read online
-[here](http://www-speech.sri.com/projects/srilm/manpages/training-scripts.1.html).
-Since the Gigaword corpus is so large, it is convenient to use `make-big-lm`
-even in environments with many parallel processors and a lot of memory.
-
-Initiate the following script from each of the source directories under the
-`lm/` directory (edit it to specify the path to the `make-big-lm` script as
-well as the pruning threshold):
-
-    #!/bin/bash
-    set -x
-
-    CMD=$SRILM_SRC/bin/make-big-lm
-    PRUNE_THRESHOLD=1e-8
-
-    $CMD \
-      -name gigalm `for k in counts/*.gz; do echo " \
-      -read $k "; done` \
-      -lm lm.gz \
-      -max-per-file 100000000 \
-      -order 5 \
-      -kndiscount \
-      -interpolate \
-      -unk \
-      -prune $PRUNE_THRESHOLD
-
-The language model attributes chosen are the following:
-
-* N-grams up to order 5
-* Kneser-Ney smoothing
-* N-gram probability estimates at the specified order *n* are interpolated with
-  lower-order estimates
-* include the unknown-word token as a regular word
-* pruning N-grams based on the specified threshold
-
-Next, we will mix the models together into a single file.
-
-### Step 3: Mix models together
-
-Using development text, interpolation weights can be determined that give highest
-weight to the source language models that have the lowest perplexity on the
-specified development set.
-
-#### Step 3-1: Determine interpolation weights
-
-Initiate the following script from the `lm/` directory (edit it to specify the
-path to the `ngram` binary as well as the path to the development text file):
-
-    #!/bin/bash
-    set -x
-
-    NGRAM=$SRILM_SRC/bin/i686-m64/ngram
-    DEV_TEXT=~mpost/expts/wmt12/runs/es-en/data/tune/tune.tok.lc.es
-
-    dirs=( afp_eng apw_eng cna_eng ltw_eng nyt_eng wpb_eng xin_eng )
-
-    for d in ${dirs[@]} ; do
-      $NGRAM -debug 2 -order 5 -unk -lm $d/lm.gz -ppl $DEV_TEXT > $d/lm.ppl ;
-    done
-
-    compute-best-mix */lm.ppl > best-mix.ppl
-
-Take a look at the contents of `best-mix.ppl`. It will contain a sequence of
-values in parentheses. These are the interpolation weights of the source
-language models in the order specified. Copy and paste the values within the
-parentheses into the script below.
-
-#### Step 3-2: Combine the models
-
-Initiate the following script from the `lm/` directory (edit it to specify the
-path to the `ngram` binary as well as the interpolation weights):
-
-    #!/bin/bash
-    set -x
-
-    NGRAM=$SRILM_SRC/bin/i686-m64/ngram
-    DIRS=(   afp_eng    apw_eng     cna_eng  ltw_eng   nyt_eng  wpb_eng  
xin_eng )
-    LAMBDAS=(0.00631272 0.000647602 0.251555 0.0134726 0.348953 0.371566 
0.00749238)
-
-    $NGRAM -order 5 -unk \
-      -lm      ${DIRS[0]}/lm.gz     -lambda  ${LAMBDAS[0]} \
-      -mix-lm  ${DIRS[1]}/lm.gz \
-      -mix-lm2 ${DIRS[2]}/lm.gz -mix-lambda2 ${LAMBDAS[2]} \
-      -mix-lm3 ${DIRS[3]}/lm.gz -mix-lambda3 ${LAMBDAS[3]} \
-      -mix-lm4 ${DIRS[4]}/lm.gz -mix-lambda4 ${LAMBDAS[4]} \
-      -mix-lm5 ${DIRS[5]}/lm.gz -mix-lambda5 ${LAMBDAS[5]} \
-      -mix-lm6 ${DIRS[6]}/lm.gz -mix-lambda6 ${LAMBDAS[6]} \
-      -write-lm mixed_lm.gz
-
-The resulting file, `mixed_lm.gz`, is a language model based on all the text in
-the Gigaword corpus and with some probabilities biased to the development text
-specified in step 3-1. It is in the ARPA format. The optional next step converts
-it into KenLM format.
-
-#### Step 3-3: Convert to KenLM
-
-The KenLM format has some speed advantages over the ARPA format. Issuing the
-following command will write a new language model file `mixed_lm.kenlm` that
-is the `mixed_lm.gz` language model transformed into the KenLM format.
-
-    $JOSHUA/src/joshua/decoder/ff/lm/kenlm/build_binary mixed_lm.gz 
mixed_lm.kenlm
-

http://git-wip-us.apache.org/repos/asf/incubator-joshua-site/blob/53cc3005/4.0/lattice.html
----------------------------------------------------------------------
diff --git a/4.0/lattice.html b/4.0/lattice.html
new file mode 100644
index 0000000..74c45fa
--- /dev/null
+++ b/4.0/lattice.html
@@ -0,0 +1,267 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" 
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
+
+<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
+  <head>
+    <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
+    <link rel="stylesheet" type="text/css" media="screen,print" 
href="../joshua4.css" />
+    <title>Joshua | Lattice decoding</title>
+  </head>
+
+  <body>
+
+    <div id="navbar">
+      <a href="http://joshua-decoder.org/">
+        <img src="../images/joshua-logo-small.png" width="130px" 
+             alt="Joshua logo (picture of a Joshua tree)" />
+      </a>
+
+      <p class="infobox">
+        <b>Stable version</b><br />
+        4.1<br/><br/>
+        <b>Release date</b><br />
+        2013 January
+      </p>
+
+<!--       <div class="infobox"> -->
+<!--         <b>AUTO LINKS</b><br/> -->
+<!--         <ul> -->
+<!--            -->
+<!--           <li> Advanced features</li> -->
+<!--            -->
+<!--           <li> Advanced features</li> -->
+<!--            -->
+<!--           <li> Advanced features</li> -->
+<!--            -->
+<!--           <li> Building a language pack</li> -->
+<!--            -->
+<!--           <li> Building a language pack</li> -->
+<!--            -->
+<!--           <li> Bundling a configuration</li> -->
+<!--            -->
+<!--           <li> Contributors</li> -->
+<!--            -->
+<!--           <li> Decoder configuration parameters</li> -->
+<!--            -->
+<!--           <li> Decoder configuration parameters</li> -->
+<!--            -->
+<!--           <li> Decoder configuration parameters</li> -->
+<!--            -->
+<!--           <li> Decoder configuration parameters</li> -->
+<!--            -->
+<!--           <li> Frequently Asked Questions</li> -->
+<!--            -->
+<!--           <li> Common problems</li> -->
+<!--            -->
+<!--           <li> Frequently Asked Questions</li> -->
+<!--            -->
+<!--           <li> Common problems</li> -->
+<!--            -->
+<!--           <li> Features</li> -->
+<!--            -->
+<!--           <li> Features</li> -->
+<!--            -->
+<!--           <li> Features</li> -->
+<!--            -->
+<!--           <li> Features</li> -->
+<!--            -->
+<!--           <li> Joshua file formats</li> -->
+<!--            -->
+<!--           <li> Joshua file formats</li> -->
+<!--            -->
+<!--           <li> Joshua file formats</li> -->
+<!--            -->
+<!--           <li> Joshua file formats</li> -->
+<!--            -->
+<!--           <li> </li> -->
+<!--            -->
+<!--           <li> </li> -->
+<!--            -->
+<!--           <li> </li> -->
+<!--            -->
+<!--           <li> Fisher and CALLHOME Spanish English Speech Translation 
Corpus</li> -->
+<!--            -->
+<!--           <li> Indian Languages Parallel Corpora</li> -->
+<!--            -->
+<!--           <li> Joshua 4.0 User Documentation</li> -->
+<!--            -->
+<!--           <li> Language packs</li> -->
+<!--            -->
+<!--           <li> Paraphrase Packs</li> -->
+<!--            -->
+<!--           <li> Joshua releases</li> -->
+<!--            -->
+<!--           <li> Support</li> -->
+<!--            -->
+<!--           <li> Getting Started</li> -->
+<!--            -->
+<!--           <li> Welcome to Joshua</li> -->
+<!--            -->
+<!--           <li> Joshua documentation</li> -->
+<!--            -->
+<!--           <li> Joshua documentation</li> -->
+<!--            -->
+<!--           <li> Installation</li> -->
+<!--            -->
+<!--           <li> Installation</li> -->
+<!--            -->
+<!--           <li> Alignment with Jacana</li> -->
+<!--            -->
+<!--           <li> Alignment with Jacana</li> -->
+<!--            -->
+<!--           <li> Alignment with Jacana</li> -->
+<!--            -->
+<!--           <li> Building large LMs with SRILM</li> -->
+<!--            -->
+<!--           <li> Building large LMs with SRILM</li> -->
+<!--            -->
+<!--           <li> Building large LMs with SRILM</li> -->
+<!--            -->
+<!--           <li> Building large LMs with SRILM</li> -->
+<!--            -->
+<!--           <li> Lattice decoding</li> -->
+<!--            -->
+<!--           <li> Grammar Packing</li> -->
+<!--            -->
+<!--           <li> Grammar Packing</li> -->
+<!--            -->
+<!--           <li> Grammar Packing</li> -->
+<!--            -->
+<!--           <li> Grammar Packing</li> -->
+<!--            -->
+<!--           <li> The Joshua Pipeline</li> -->
+<!--            -->
+<!--           <li> The Joshua Pipeline</li> -->
+<!--            -->
+<!--           <li> The Joshua Pipeline</li> -->
+<!--            -->
+<!--           <li> The Joshua Pipeline</li> -->
+<!--            -->
+<!--           <li> Quick Start</li> -->
+<!--            -->
+<!--           <li> Quick Start</li> -->
+<!--            -->
+<!--           <li> Releases</li> -->
+<!--            -->
+<!--           <li> Server mode</li> -->
+<!--            -->
+<!--           <li> Server mode</li> -->
+<!--            -->
+<!--           <li> Server mode</li> -->
+<!--            -->
+<!--           <li> Installing and running the Joshua Decoder</li> -->
+<!--            -->
+<!--           <li> Grammar extraction with Thrax</li> -->
+<!--            -->
+<!--           <li> Grammar extraction with Thrax</li> -->
+<!--            -->
+<!--           <li> Grammar extraction with Thrax</li> -->
+<!--            -->
+<!--           <li> Grammar extraction with Thrax</li> -->
+<!--            -->
+<!--           <li> Building Translation Models</li> -->
+<!--            -->
+<!--           <li> Building Translation Models</li> -->
+<!--            -->
+<!--           <li> Building Translation Models</li> -->
+<!--            -->
+<!--           <li> Building Translation Models</li> -->
+<!--            -->
+<!--           <li> Pipeline tutorial</li> -->
+<!--            -->
+<!--           <li> Pipeline tutorial</li> -->
+<!--            -->
+<!--           <li> Pipeline tutorial</li> -->
+<!--            -->
+<!--           <li> What's New</li> -->
+<!--            -->
+<!--           <li> What's New</li> -->
+<!--            -->
+<!--           <li> Z-MERT</li> -->
+<!--            -->
+<!--           <li> Z-MERT</li> -->
+<!--            -->
+<!--           <li> Z-MERT</li> -->
+<!--            -->
+<!--           <li> Z-MERT</li> -->
+<!--            -->
+<!--           <li> </li> -->
+<!--            -->
+<!--           <li> </li> -->
+<!--            -->
+<!--           <li> </li> -->
+<!--            -->
+<!--         </ul> -->
+<!--       </div>   -->
+
+      <div class="infobox">
+
+        <b>Links</b><br />
+        <ul>
+          <li> <a href="../index.html">Main</a> </li>
+          <li> <a href="pipeline.html">Pipeline</a> </li>
+          <li> <a href="step-by-step-instructions.html">Manual walkthrough</a> 
</li>
+          <li> <a href="decoder.html">Decoder</a> </li>
+          <li> <a href="server.html">Decoder Server</a> </li>
+          <li> <a href="file-formats.html">File formats</a> </li>
+          <li> <a href="thrax.html">Grammar Extraction</a> </li>
+          <li> <a href="../releases.html">Releases</a> </li>
+        </ul>
+      </div>
+
+      <div class="infobox">
+        <b>Advanced</b><br />
+        <ul>
+<!--          <li> <a href="packing.html">Grammar packing</a> </li> -->
+          <li> <a href="large-lms.html">Building large LMs</a> </li>
+          <li> <a href="zmert.html">Running Z-MERT</a> </li>
+          <li> <a href="lattice.html">Lattices</a> </li>
+          <li> <a href="server.html">TCP/IP server</a> </li>
+          <li> <a href="bundle.html">Bundled configuration</a> </li>
+        </ul>
+      </div>
+
+      <div class="infobox">
+        <b>Help</b><br />
+        <ul>
+          <li> <a href="faq.html">Answers</a> </li>
+          <li> <a 
href="https://groups.google.com/d/forum/joshua_support">Archive</a> </li>
+        </ul>
+      </div>
+
+      <div class="footer">
+        Last updated on April 08, 2016
+      </div>
+
+    </div>
+
+    <div id="main">
+      <div id="title">
+        <h1>Lattice decoding</h1>
+      </div>
+
+      <div id="content">
+        
+        <p>In addition to regular sentences, Joshua can decode weighted 
lattices encoded in <a 
href="http://www.statmt.org/moses/?n=Moses.WordLattices">the PLF
+format</a>.  Lattice decoding was originally added
+by Lane Schwartz and <a href="http://www.cs.cmu.edu/~cdyer/">Chris 
Dyer</a>.</p>
+
+<p>Joshua will automatically detect whether the input sentence is a regular 
sentence
+(the usual case) or a lattice.  If a lattice, a feature will be activated that 
accumulates the cost
+of different paths through the lattice.  In this case, you need to ensure that 
a weight for this
+feature is present in <a href="decoder.html">your model file</a>.</p>
+
+<p>The main caveat with Joshua’s PLF lattice support is that the lattice 
needs to be listed on a
+single line.</p>
+
+
+      </div>
+    </div>
+
+  </body>
+</html>
+
+
+
+
+

http://git-wip-us.apache.org/repos/asf/incubator-joshua-site/blob/53cc3005/4.0/lattice.md
----------------------------------------------------------------------
diff --git a/4.0/lattice.md b/4.0/lattice.md
deleted file mode 100644
index 5d6bd47..0000000
--- a/4.0/lattice.md
+++ /dev/null
@@ -1,17 +0,0 @@
----
-layout: default4
-category: advanced
-title: Lattice decoding
----
-
-In addition to regular sentences, Joshua can decode weighted lattices encoded 
in [the PLF
-format](http://www.statmt.org/moses/?n=Moses.WordLattices).  Lattice decoding 
was originally added
-by Lane Schwartz and [Chris Dyer](http://www.cs.cmu.edu/~cdyer/).
-
-Joshua will automatically detect whether the input sentence is a regular 
sentence
-(the usual case) or a lattice.  If a lattice, a feature will be activated that 
accumulates the cost
-of different paths through the lattice.  In this case, you need to ensure that 
a weight for this
-feature is present in [your model file](decoder.html).
-
-The main caveat with Joshua's PLF lattice support is that the lattice needs 
to be listed on a
-single line.

http://git-wip-us.apache.org/repos/asf/incubator-joshua-site/blob/53cc3005/4.0/packing.html
----------------------------------------------------------------------
diff --git a/4.0/packing.html b/4.0/packing.html
new file mode 100644
index 0000000..cdfa675
--- /dev/null
+++ b/4.0/packing.html
@@ -0,0 +1,357 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" 
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
+
+<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
+  <head>
+    <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
+    <link rel="stylesheet" type="text/css" media="screen,print" 
href="../joshua4.css" />
+    <title>Joshua | Grammar Packing</title>
+  </head>
+
+  <body>
+
+    <div id="navbar">
+      <a href="http://joshua-decoder.org/">
+        <img src="../images/joshua-logo-small.png" width="130px" 
+             alt="Joshua logo (picture of a Joshua tree)" />
+      </a>
+
+      <p class="infobox">
+        <b>Stable version</b><br />
+        4.1<br/><br/>
+        <b>Release date</b><br />
+        2013 January
+      </p>
+
+<!--       <div class="infobox"> -->
+<!--         <b>AUTO LINKS</b><br/> -->
+<!--         <ul> -->
+<!--            -->
+<!--           <li> Advanced features</li> -->
+<!--            -->
+<!--           <li> Advanced features</li> -->
+<!--            -->
+<!--           <li> Advanced features</li> -->
+<!--            -->
+<!--           <li> Building a language pack</li> -->
+<!--            -->
+<!--           <li> Building a language pack</li> -->
+<!--            -->
+<!--           <li> Bundling a configuration</li> -->
+<!--            -->
+<!--           <li> Contributors</li> -->
+<!--            -->
+<!--           <li> Decoder configuration parameters</li> -->
+<!--            -->
+<!--           <li> Decoder configuration parameters</li> -->
+<!--            -->
+<!--           <li> Decoder configuration parameters</li> -->
+<!--            -->
+<!--           <li> Decoder configuration parameters</li> -->
+<!--            -->
+<!--           <li> Frequently Asked Questions</li> -->
+<!--            -->
+<!--           <li> Common problems</li> -->
+<!--            -->
+<!--           <li> Frequently Asked Questions</li> -->
+<!--            -->
+<!--           <li> Common problems</li> -->
+<!--            -->
+<!--           <li> Features</li> -->
+<!--            -->
+<!--           <li> Features</li> -->
+<!--            -->
+<!--           <li> Features</li> -->
+<!--            -->
+<!--           <li> Features</li> -->
+<!--            -->
+<!--           <li> Joshua file formats</li> -->
+<!--            -->
+<!--           <li> Joshua file formats</li> -->
+<!--            -->
+<!--           <li> Joshua file formats</li> -->
+<!--            -->
+<!--           <li> Joshua file formats</li> -->
+<!--            -->
+<!--           <li> </li> -->
+<!--            -->
+<!--           <li> </li> -->
+<!--            -->
+<!--           <li> </li> -->
+<!--            -->
+<!--           <li> Fisher and CALLHOME Spanish English Speech Translation 
Corpus</li> -->
+<!--            -->
+<!--           <li> Indian Languages Parallel Corpora</li> -->
+<!--            -->
+<!--           <li> Joshua 4.0 User Documentation</li> -->
+<!--            -->
+<!--           <li> Language packs</li> -->
+<!--            -->
+<!--           <li> Paraphrase Packs</li> -->
+<!--            -->
+<!--           <li> Joshua releases</li> -->
+<!--            -->
+<!--           <li> Support</li> -->
+<!--            -->
+<!--           <li> Getting Started</li> -->
+<!--            -->
+<!--           <li> Welcome to Joshua</li> -->
+<!--            -->
+<!--           <li> Joshua documentation</li> -->
+<!--            -->
+<!--           <li> Joshua documentation</li> -->
+<!--            -->
+<!--           <li> Installation</li> -->
+<!--            -->
+<!--           <li> Installation</li> -->
+<!--            -->
+<!--           <li> Alignment with Jacana</li> -->
+<!--            -->
+<!--           <li> Alignment with Jacana</li> -->
+<!--            -->
+<!--           <li> Alignment with Jacana</li> -->
+<!--            -->
+<!--           <li> Building large LMs with SRILM</li> -->
+<!--            -->
+<!--           <li> Building large LMs with SRILM</li> -->
+<!--            -->
+<!--           <li> Building large LMs with SRILM</li> -->
+<!--            -->
+<!--           <li> Building large LMs with SRILM</li> -->
+<!--            -->
+<!--           <li> Lattice decoding</li> -->
+<!--            -->
+<!--           <li> Grammar Packing</li> -->
+<!--            -->
+<!--           <li> Grammar Packing</li> -->
+<!--            -->
+<!--           <li> Grammar Packing</li> -->
+<!--            -->
+<!--           <li> Grammar Packing</li> -->
+<!--            -->
+<!--           <li> The Joshua Pipeline</li> -->
+<!--            -->
+<!--           <li> The Joshua Pipeline</li> -->
+<!--            -->
+<!--           <li> The Joshua Pipeline</li> -->
+<!--            -->
+<!--           <li> The Joshua Pipeline</li> -->
+<!--            -->
+<!--           <li> Quick Start</li> -->
+<!--            -->
+<!--           <li> Quick Start</li> -->
+<!--            -->
+<!--           <li> Releases</li> -->
+<!--            -->
+<!--           <li> Server mode</li> -->
+<!--            -->
+<!--           <li> Server mode</li> -->
+<!--            -->
+<!--           <li> Server mode</li> -->
+<!--            -->
+<!--           <li> Installing and running the Joshua Decoder</li> -->
+<!--            -->
+<!--           <li> Grammar extraction with Thrax</li> -->
+<!--            -->
+<!--           <li> Grammar extraction with Thrax</li> -->
+<!--            -->
+<!--           <li> Grammar extraction with Thrax</li> -->
+<!--            -->
+<!--           <li> Grammar extraction with Thrax</li> -->
+<!--            -->
+<!--           <li> Building Translation Models</li> -->
+<!--            -->
+<!--           <li> Building Translation Models</li> -->
+<!--            -->
+<!--           <li> Building Translation Models</li> -->
+<!--            -->
+<!--           <li> Building Translation Models</li> -->
+<!--            -->
+<!--           <li> Pipeline tutorial</li> -->
+<!--            -->
+<!--           <li> Pipeline tutorial</li> -->
+<!--            -->
+<!--           <li> Pipeline tutorial</li> -->
+<!--            -->
+<!--           <li> What's New</li> -->
+<!--            -->
+<!--           <li> What's New</li> -->
+<!--            -->
+<!--           <li> Z-MERT</li> -->
+<!--            -->
+<!--           <li> Z-MERT</li> -->
+<!--            -->
+<!--           <li> Z-MERT</li> -->
+<!--            -->
+<!--           <li> Z-MERT</li> -->
+<!--            -->
+<!--           <li> </li> -->
+<!--            -->
+<!--           <li> </li> -->
+<!--            -->
+<!--           <li> </li> -->
+<!--            -->
+<!--         </ul> -->
+<!--       </div>   -->
+
+      <div class="infobox">
+
+        <b>Links</b><br />
+        <ul>
+          <li> <a href="../index.html">Main</a> </li>
+          <li> <a href="pipeline.html">Pipeline</a> </li>
+          <li> <a href="step-by-step-instructions.html">Manual walkthrough</a> 
</li>
+          <li> <a href="decoder.html">Decoder</a> </li>
+          <li> <a href="server.html">Decoder Server</a> </li>
+          <li> <a href="file-formats.html">File formats</a> </li>
+          <li> <a href="thrax.html">Grammar Extraction</a> </li>
+          <li> <a href="../releases.html">Releases</a> </li>
+        </ul>
+      </div>
+
+      <div class="infobox">
+        <b>Advanced</b><br />
+        <ul>
+<!--          <li> <a href="packing.html">Grammar packing</a> </li> -->
+          <li> <a href="large-lms.html">Building large LMs</a> </li>
+          <li> <a href="zmert.html">Running Z-MERT</a> </li>
+          <li> <a href="lattice.html">Lattices</a> </li>
+          <li> <a href="server.html">TCP/IP server</a> </li>
+          <li> <a href="bundle.html">Bundled configuration</a> </li>
+        </ul>
+      </div>
+
+      <div class="infobox">
+        <b>Help</b><br />
+        <ul>
+          <li> <a href="faq.html">Answers</a> </li>
+          <li> <a 
href="https://groups.google.com/d/forum/joshua_support">Archive</a> </li>
+        </ul>
+      </div>
+
+      <div class="footer">
+        Last updated on April 08, 2016
+      </div>
+
+    </div>
+
+    <div id="main">
+      <div id="title">
+        <h1>Grammar Packing</h1>
+      </div>
+
+      <div id="content">
+        
+        <p>Grammar packing refers to the process of taking a textual grammar 
output by <a href="thrax.html">Thrax</a> and
+efficiently encoding it for use by Joshua.  Packing the grammar results in 
significantly faster load
+times for very large grammars.</p>
+
+<p>Soon, the <a href="pipeline.html">Joshua pipeline script</a> will add 
support for grammar packing
+automatically, and we will provide a script that automates these steps for 
you.</p>
+
+<ol>
+  <li>
+    <p>Make sure the grammar is labeled.  A labeled grammar is one that has 
feature names attached to
+each of the feature values in each row of the grammar file.  Here is a line 
from an unlabeled
+grammar:</p>
+
+    <div class="highlighter-rouge"><pre class="highlight"><code> [X] ||| [X,1] 
অন্যান্য [X,2] ||| [X,1] other [X,2] ||| 0 0 1 0 0 1.02184
+</code></pre>
+    </div>
+
+    <p>and here is one from a labeled grammar (note that the labels are not 
very useful):</p>
+
+    <div class="highlighter-rouge"><pre class="highlight"><code> [X] ||| [X,1] 
অন্যান্য [X,2] ||| [X,1] other [X,2] ||| f1=0 f2=0 f3=1 f4=0 
f5=0 f6=1.02184
+</code></pre>
+    </div>
+
+    <p>If your grammar is not labeled, you can use the script <code 
class="highlighter-rouge">$JOSHUA/scripts/label_grammar.py</code>:</p>
+
+    <div class="highlighter-rouge"><pre class="highlight"><code> zcat 
grammar.gz | $JOSHUA/scripts/label_grammar.py &gt; grammar-labeled.gz
+</code></pre>
+    </div>
+
+    <p>A side effect of this step is to produce a file “dense_map” in 
the current directory,
+containing the mapping between feature names and feature columns.  This file 
is needed in later
+steps.</p>
+  </li>
+  <li>
+    <p>The packer needs a sorted grammar.  It is sufficient to sort by the 
first word:</p>
+
+    <div class="highlighter-rouge"><pre class="highlight"><code> zcat 
grammar-labeled.gz | sort -k3,3 | gzip &gt; grammar-sorted.gz
+</code></pre>
+    </div>
+
+    <p>(The reason we need a sorted grammar is that the packer stores the 
grammar in a trie.  The
+pieces can’t be more than 2 GB due to Java limitations, so we need to ensure 
that rules are
+grouped by the first arc in the trie to avoid redundancy across tries and to 
simplify the
+lookup.)</p>
+  </li>
+  <li>
+    <p>In order to pack the grammar, we need two pieces of information: (1) a 
packer configuration file,
+and (2) a dense map file.</p>
+
+    <ol>
+      <li>
+        <p>Write a packer config file.  This file specifies items such as the 
chunk size (for the packed
+pieces) and the quantization classes and types for each feature name.  
Examples can be found
+at</p>
+
+        <div class="highlighter-rouge"><pre class="highlight"><code>  
$JOSHUA/test/packed/packer.config
+  $JOSHUA/test/bn-en/packed/packer.quantized
+  $JOSHUA/test/bn-en/packed/packer.uncompressed
+</code></pre>
+        </div>
+
+        <p>The quantizer lines in the packer config file have the following 
format:</p>
+
+        <div class="highlighter-rouge"><pre class="highlight"><code>  
quantizer TYPE FEATURES
+</code></pre>
+        </div>
+
+        <p>where <code class="highlighter-rouge">TYPE</code> is one of <code 
class="highlighter-rouge">boolean</code>, <code 
class="highlighter-rouge">float</code>, <code 
class="highlighter-rouge">byte</code>, or <code 
class="highlighter-rouge">8bit</code>, and <code 
class="highlighter-rouge">FEATURES</code> is a
+ space-delimited list of feature names that have that quantization type.</p>
+      </li>
+      <li>
+        <p>Write a dense_map file.  If you labeled an unlabeled grammar, this 
was produced for you as a
+side product of the <code class="highlighter-rouge">label_grammar.py</code> 
script you called in Step 1.  Otherwise, you need to
+create a file that lists the mapping between feature names and (0-indexed) 
columns in the
+grammar, one per line, in the following format:</p>
+
+        <div class="highlighter-rouge"><pre class="highlight"><code>  
feature-index feature-name
+</code></pre>
+        </div>
+      </li>
+    </ol>
+  </li>
+  <li>
+    <p>To pack the grammar, type the following command:</p>
+
+    <div class="highlighter-rouge"><pre class="highlight"><code> java -cp 
$JOSHUA/bin joshua.tools.GrammarPacker -c PACKER_CONFIG_FILE -p OUTPUT_DIR -g 
GRAMMAR_FILE
+</code></pre>
+    </div>
+
+    <p>This will read in your packer configuration file and your grammar, and 
produce a packed grammar
+ in the output directory.</p>
+  </li>
+  <li>
+    <p>To use the packed grammar, just point to the packed directory in your 
Joshua configuration file.</p>
+
+    <div class="highlighter-rouge"><pre class="highlight"><code> tm-file = 
packed-grammar/
+ tm-format = packed
+</code></pre>
+    </div>
+  </li>
+</ol>
+
+
+      </div>
+    </div>
+
+  </body>
+</html>
+
+
+
+
+

[42/44] incubator-joshua-site git commit: First attempt

Reply via email to