http://git-wip-us.apache.org/repos/asf/incubator-joshua-site/blob/53cc3005/5.0/pipeline.html ---------------------------------------------------------------------- diff --git a/5.0/pipeline.html b/5.0/pipeline.html new file mode 100644 index 0000000..bb56bee --- /dev/null +++ b/5.0/pipeline.html @@ -0,0 +1,919 @@ +<!DOCTYPE html> +<html lang="en"> + <head> + <meta charset="utf-8"> + <title>Joshua Documentation | The Joshua Pipeline</title> + <meta name="viewport" content="width=device-width, initial-scale=1.0"> + <meta name="description" content=""> + <meta name="author" content=""> + + <!-- Le styles --> + <link href="/bootstrap/css/bootstrap.css" rel="stylesheet"> + <style> + body { + padding-top: 60px; /* 60px to make the container go all the way to the bottom of the topbar */ + } + #download { + background-color: green; + font-size: 14pt; + font-weight: bold; + text-align: center; + color: white; + border-radius: 5px; + padding: 4px; + } + + #download a:link { + color: white; + } + + #download a:hover { + color: lightgrey; + } + + #download a:visited { + color: white; + } + + a.pdf { + font-variant: small-caps; + /* font-weight: bold; */ + font-size: 10pt; + color: white; + background: brown; + padding: 2px; + } + + a.bibtex { + font-variant: small-caps; + /* font-weight: bold; */ + font-size: 10pt; + color: white; + background: orange; + padding: 2px; + } + + img.sponsor { + height: 120px; + margin: 5px; + } + </style> + <link href="bootstrap/css/bootstrap-responsive.css" rel="stylesheet"> + + <!-- HTML5 shim, for IE6-8 support of HTML5 elements --> + <!--[if lt IE 9]> + <script src="bootstrap/js/html5shiv.js"></script> + <![endif]--> + + <!-- Fav and touch icons --> + <link rel="apple-touch-icon-precomposed" sizes="144x144" href="bootstrap/ico/apple-touch-icon-144-precomposed.png"> + <link rel="apple-touch-icon-precomposed" sizes="114x114" href="bootstrap/ico/apple-touch-icon-114-precomposed.png"> + <link rel="apple-touch-icon-precomposed" sizes="72x72" href="bootstrap/ico/apple-touch-icon-72-precomposed.png"> + <link rel="apple-touch-icon-precomposed" href="bootstrap/ico/apple-touch-icon-57-precomposed.png"> + <link rel="shortcut icon" href="bootstrap/ico/favicon.png"> + </head> + + <body> + + <div class="navbar navbar-inverse navbar-fixed-top"> + <div class="navbar-inner"> + <div class="container"> + <button type="button" class="btn btn-navbar" data-toggle="collapse" data-target=".nav-collapse"> + <span class="icon-bar"></span> + <span class="icon-bar"></span> + <span class="icon-bar"></span> + </button> + <a class="brand" href="/">Joshua</a> + <div class="nav-collapse collapse"> + <ul class="nav"> + <li><a href="index.html">Documentation</a></li> + <li><a href="pipeline.html">Pipeline</a></li> + <li><a href="tutorial.html">Tutorial</a></li> + <li><a href="decoder.html">Decoder</a></li> + <li><a href="thrax.html">Thrax</a></li> + <li><a href="file-formats.html">File formats</a></li> + <!-- <li><a href="advanced.html">Advanced</a></li> --> + <li><a href="faq.html">FAQ</a></li> + </ul> + </div><!--/.nav-collapse --> + </div> + </div> + </div> + + <div class="container"> + + <div class="row"> + <div class="span2"> + <img src="/images/joshua-logo-small.png" + alt="Joshua logo (picture of a Joshua tree)" /> + </div> + <div class="span10"> + <h1>Joshua Documentation</h1> + <h2>The Joshua Pipeline</h2> + <span id="download"> + <a href="http://cs.jhu.edu/~post/files/joshua-v5.0.tgz">Download</a> + </span> + (version 5.0, released 16 August 2013) + </div> + </div> + + <hr /> + + <div 
class="row">
+      <div class="span8">
+
+<p>This page describes the Joshua pipeline script, which manages the complexity of training and evaluating machine translation systems. The pipeline eases the pain of two related tasks in statistical machine translation (SMT) research:</p>
+
+<ul>
+  <li>
+    <p>Training SMT systems involves a complicated process of interacting steps that are time-consuming and prone to failure.</p>
+  </li>
+  <li>
+    <p>Developing and testing new techniques requires varying parameters at different points in the pipeline. Earlier results (which are often expensive) need not be recomputed.</p>
+  </li>
+</ul>
+
+<p>To facilitate these tasks, the pipeline script:</p>
+
+<ul>
+  <li>
+    <p>Runs the complete SMT pipeline, from corpus normalization and tokenization, through alignment, model building, tuning, test-set decoding, and evaluation.</p>
+  </li>
+  <li>
+    <p>Caches the results of intermediate steps (using robust SHA-1 checksums on dependencies), so the pipeline can be debugged or shared across similar runs while doing away with time spent recomputing expensive steps.</p>
+  </li>
+  <li>
+    <p>Allows you to jump into and out of the pipeline at a set of predefined places (e.g., the alignment stage), so long as you provide the missing dependencies.</p>
+  </li>
+</ul>
+
+<p>The Joshua pipeline script is designed in the spirit of Moses' <code class="highlighter-rouge">train-model.pl</code>, and shares many of its features. It is not as extensive, however, as Moses' <a href="http://www.statmt.org/moses/?n=FactoredTraining.EMS">Experiment Management System</a>, which allows the user to define arbitrary execution dependency graphs.</p>
+
+<h2 id="installation">Installation</h2>
+
+<p>The pipeline has no <em>required</em> external dependencies. However, it has support for a number of external packages, some of which are included with Joshua.</p>
+
+<ul>
+  <li>
+    <p><a href="http://code.google.com/p/giza-pp/">GIZA++</a> (included)</p>
+
+    <p>GIZA++ is the default aligner. It is included with Joshua, and should compile successfully when you type <code class="highlighter-rouge">ant</code> from the Joshua root directory. It is not required because you can use the (included) Berkeley aligner (<code class="highlighter-rouge">--aligner berkeley</code>). We have recently also provided support for the <a href="http://code.google.com/p/jacana-xy/wiki/JacanaXY">Jacana-XY aligner</a> (<code class="highlighter-rouge">--aligner jacana</code>).</p>
+  </li>
+  <li>
+    <p><a href="http://hadoop.apache.org/">Hadoop</a> (included)</p>
+
+    <p>The pipeline uses the <a href="thrax.html">Thrax grammar extractor</a>, which is built on Hadoop. If you have a Hadoop installation, simply ensure that the <code class="highlighter-rouge">$HADOOP</code> environment variable is defined, and the pipeline will use it automatically at the grammar extraction step. If you are going to attempt to extract very large grammars, it is best to have a good-sized Hadoop installation.</p>
+
+    <p>(If you do not have a Hadoop installation, you might consider setting one up. Hadoop can be installed in a <a href="http://hadoop.apache.org/common/docs/r0.20.2/quickstart.html#PseudoDistributed">"pseudo-distributed"</a> mode that allows it to use just a few machines or a number of processors on a single machine.
+The main issue is to ensure that there are a lot of independent physical disks, since in our experience Hadoop starts to exhibit lots of hard-to-trace problems if there is too much demand on the disks.)</p>
+
+    <p>If you don't have a Hadoop installation, there are still no worries. The pipeline will unroll a standalone installation and use it to extract your grammar. This behavior will be triggered if <code class="highlighter-rouge">$HADOOP</code> is undefined.</p>
+  </li>
+  <li>
+    <p><a href="http://www.speech.sri.com/projects/srilm/">SRILM</a> (not included)</p>
+
+    <p>By default, the pipeline uses a Java program from the <a href="http://code.google.com/p/berkeleylm/">Berkeley LM</a> package that constructs a Kneser-Ney-smoothed language model in ARPA format from the target side of your training data. If you wish to use SRILM instead, you need to do the following:</p>
+
+    <ol>
+      <li>Install SRILM and set the <code class="highlighter-rouge">$SRILM</code> environment variable to point to its installed location.</li>
+      <li>Add the <code class="highlighter-rouge">--lm-gen srilm</code> flag to your pipeline invocation.</li>
+    </ol>
+
+    <p>More information on this is available in the <a href="#lm">LM building section of the pipeline</a>. SRILM is not used for representing language models during decoding (and in fact is not supported, having been supplanted by <a href="http://kheafield.com/code/kenlm/">KenLM</a> (the default) and BerkeleyLM).</p>
+  </li>
+  <li>
+    <p><a href="http://statmt.org/moses/">Moses</a> (not included)</p>
+  </li>
+</ul>
+
+<p>Make sure that the environment variable <code class="highlighter-rouge">$JOSHUA</code> is defined, and you should be all set.</p>
+
+<h2 id="a-basic-pipeline-run">A basic pipeline run</h2>
+
+<p>The pipeline takes a set of inputs (training, tuning, and test data), and creates a set of intermediate files in the <em>run directory</em>. By default, the run directory is the current directory, but it can be changed with the <code class="highlighter-rouge">--rundir</code> parameter.</p>
+
+<p>For this quick start, we will be working with the example that can be found in <code class="highlighter-rouge">$JOSHUA/examples/pipeline</code>. This example contains 1,000 sentences of Urdu-English data (the full dataset is available as part of the <a href="/indian-parallel-corpora/">Indian languages parallel corpora</a>), with 100-sentence tuning and test sets with four references each.</p>
+
+<p>Running the pipeline requires two main steps: data preparation and invocation.</p>
+
+<ol>
+  <li>
+    <p>Prepare your data. The pipeline script needs to be told where to find the raw training, tuning, and test data. A good convention is to place these files in an input/ subdirectory of your run's working directory (NOTE: do not use <code class="highlighter-rouge">data/</code>, since a directory of that name is created and used by the pipeline itself for storing processed files). The expected format (for each of training, tuning, and test) is a pair of files that share a common path prefix and are distinguished by their extension, e.g.,</p>
+
+    <div class="highlighter-rouge"><pre class="highlight"><code>input/
+      train.SOURCE
+      train.TARGET
+      tune.SOURCE
+      tune.TARGET
+      test.SOURCE
+      test.TARGET
+</code></pre>
+    </div>
+
+    <p>These files should be parallel at the sentence level (with one sentence per line), should be in UTF-8, and should be untokenized (tokenization occurs in the pipeline).
SOURCE and TARGET denote variables that should be replaced with the actual source and target language abbreviations (e.g., "ur" and "en").</p>
+  </li>
+  <li>
+    <p>Run the pipeline. The following is the minimal invocation to run the complete pipeline:</p>
+
+    <div class="highlighter-rouge"><pre class="highlight"><code>$JOSHUA/bin/pipeline.pl \
+  --corpus input/train \
+  --tune input/tune \
+  --test input/devtest \
+  --source SOURCE \
+  --target TARGET
+</code></pre>
+    </div>
+
+    <p>The <code class="highlighter-rouge">--corpus</code>, <code class="highlighter-rouge">--tune</code>, and <code class="highlighter-rouge">--test</code> flags define file prefixes that are concatenated with the language extensions given by <code class="highlighter-rouge">--target</code> and <code class="highlighter-rouge">--source</code> (with a "." in between). Note the correspondences with the files defined in the first step above. The prefixes can be either absolute or relative pathnames. This particular invocation assumes that a subdirectory <code class="highlighter-rouge">input/</code> exists in the current directory, that you are translating from a language identified by the "ur" extension to a language identified by the "en" extension, that the training data can be found at <code class="highlighter-rouge">input/train.en</code> and <code class="highlighter-rouge">input/train.ur</code>, and so on. (A concrete invocation for the example Urdu-English data is sketched just after this list.)</p>
+  </li>
+</ol>
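+<p>For instance, a complete run on the example Urdu-English data, kept in its own numbered directory, might be launched as follows (a sketch; the <code class="highlighter-rouge">--rundir</code> value and the <code class="highlighter-rouge">--readme</code> note are arbitrary choices, and every flag shown is described elsewhere on this page):</p>
+
+<div class="highlighter-rouge"><pre class="highlight"><code>$JOSHUA/bin/pipeline.pl \
+  --rundir 1 \
+  --readme "Hiero baseline on the example Urdu-English data" \
+  --corpus input/train \
+  --tune input/tune \
+  --test input/devtest \
+  --source ur \
+  --target en
+</code></pre>
+</div>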
+<p><em>Don't</em> run the pipeline directly from <code class="highlighter-rouge">$JOSHUA</code>. We recommend creating a run directory somewhere else to contain all of your experiments. The advantage to this (apart from not clobbering part of the Joshua install) is that Joshua provides support scripts for visualizing the results of a series of experiments that only work if you organize your runs as described below.</p>
+
+<p>Assuming no problems arise, this command will run the complete pipeline in about 20 minutes, producing BLEU scores at the end. As it runs, you will see output that looks like the following:</p>
+
+<div class="highlighter-rouge"><pre class="highlight"><code>[train-copy-en] rebuilding...
+  dep=/Users/post/code/joshua/test/pipeline/input/train.en
+  dep=data/train/train.en.gz [NOT FOUND]
+  cmd=cat /Users/post/code/joshua/test/pipeline/input/train.en | gzip -9n > data/train/train.en.gz
+  took 0 seconds (0s)
+[train-copy-ur] rebuilding...
+  dep=/Users/post/code/joshua/test/pipeline/input/train.ur
+  dep=data/train/train.ur.gz [NOT FOUND]
+  cmd=cat /Users/post/code/joshua/test/pipeline/input/train.ur | gzip -9n > data/train/train.ur.gz
+  took 0 seconds (0s)
+...
+</code></pre>
+</div>
+
+<p>And in the current directory, you will see the following files (among other intermediate files generated by the individual sub-steps).</p>
+
+<div class="highlighter-rouge"><pre class="highlight"><code>data/
+    train/
+        corpus.ur
+        corpus.en
+        thrax-input-file
+    tune/
+        tune.tok.lc.ur
+        tune.tok.lc.en
+        grammar.filtered.gz
+        grammar.glue
+    test/
+        test.tok.lc.ur
+        test.tok.lc.en
+        grammar.filtered.gz
+        grammar.glue
+alignments/
+    0/
+        [giza/berkeley aligner output files]
+    training.align
+thrax-hiero.conf
+thrax.log
+grammar.gz
+lm.gz
+tune/
+    1/
+        decoder_command
+        joshua.config
+        params.txt
+        joshua.log
+        mert.log
+        joshua.config.ZMERT.final
+        final-bleu
+</code></pre>
+</div>
+
+<p>These files will be described in more detail in subsequent sections of this tutorial.</p>
+
+<p>Another useful flag is <code class="highlighter-rouge">--rundir DIR</code>, which chdir()s to the specified directory before running the pipeline. By default the rundir is the current directory. Changing it can be useful for organizing related pipeline runs. Relative paths specified to other flags (e.g., to <code class="highlighter-rouge">--corpus</code> or <code class="highlighter-rouge">--lmfile</code>) are relative to the directory the pipeline was called <em>from</em>, not the rundir itself (unless they happen to be the same, of course).</p>
+
+<p>The complete pipeline comprises many tens of small steps, which can be grouped together into a set of traditional pipeline tasks:</p>
+
+<ol>
+  <li><a href="#prep">Data preparation</a></li>
+  <li><a href="#alignment">Alignment</a></li>
+  <li><a href="#parsing">Parsing</a> (syntax-based grammars only)</li>
+  <li><a href="#tm">Grammar extraction</a></li>
+  <li><a href="#lm">Language model building</a></li>
+  <li><a href="#tuning">Tuning</a></li>
+  <li><a href="#testing">Testing</a></li>
+  <li><a href="#analysis">Analysis</a></li>
+</ol>
+
+<p>These steps are discussed below, after a few intervening sections about high-level details of the pipeline.</p>
+
+<h2 id="managing-groups-of-experiments">Managing groups of experiments</h2>
+
+<p>The real utility of the pipeline comes when you use it to manage groups of experiments. Typically, there is a held-out test set, and we want to vary a number of training parameters to determine what effect this has on BLEU scores or some other metric. Joshua comes with a script <code class="highlighter-rouge">$JOSHUA/scripts/training/summarize.pl</code> that collects information from a group of runs and reports it to you. This script works so long as you organize your runs as follows:</p>
+
+<ol>
+  <li>
+    <p>Your runs should be grouped together in a root directory, which I'll call <code class="highlighter-rouge">$RUNDIR</code>.</p>
+  </li>
+  <li>
+    <p>For comparison purposes, the runs should all be evaluated on the same test set.</p>
+  </li>
+  <li>
+    <p>Each run in the run group should be in its own numbered directory, shown with the files used by the summarize script:</p>
+
+    <div class="highlighter-rouge"><pre class="highlight"><code>$RUNDIR/
+    1/
+        README.txt
+        test/
+            final-bleu
+            final-times
+        [other files]
+    2/
+        README.txt
+        ...
+</code></pre>
+    </div>
+  </li>
+</ol>
+
+<p>You can get such directories using the <code class="highlighter-rouge">--rundir N</code> flag to the pipeline.</p>
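+<p>For example, two related runs might share everything except a single change (a sketch; the <code class="highlighter-rouge">--readme</code> notes are arbitrary, and the extra language model path is a placeholder):</p>
+
+<div class="highlighter-rouge"><pre class="highlight"><code>$JOSHUA/bin/pipeline.pl --rundir 1 --readme "baseline" ...
+$JOSHUA/bin/pipeline.pl --rundir 2 --readme "baseline + extra LM" --lmfile /path/to/extra.lm.gz ...
+</code></pre>
+</div>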
+<p>Run directories can build off each other. For example, <code class="highlighter-rouge">1/</code> might contain a complete baseline run. If you want to just change the tuner, you don't need to rerun the aligner and model builder, so you can reuse the results by supplying the second run with the information it needs that was computed in step 1:</p>
+
+<div class="highlighter-rouge"><pre class="highlight"><code>$JOSHUA/bin/pipeline.pl \
+  --first-step tune \
+  --grammar 1/grammar.gz \
+  ...
+</code></pre>
+</div>
+
+<p>More details are below.</p>
+
+<h2 id="grammar-options">Grammar options</h2>
+
+<p>Joshua can extract three types of grammars: Hiero, GHKM, and SAMT grammars. As described on the <a href="file-formats.html">file formats page</a>, all of them are encoded into the same file format, but they differ in terms of the richness of their nonterminal sets.</p>
+
+<p>Hiero grammars make use of a single nonterminal, and are extracted by computing phrases from word-based alignments and then subtracting out phrase differences. More detail can be found in <a href="http://www.mitpressjournals.org/doi/abs/10.1162/coli.2007.33.2.201">Chiang (2007) [PDF]</a>. <a href="http://www.isi.edu/%7Emarcu/papers/cr_ghkm_naacl04.pdf">GHKM</a> (new with 5.0) and <a href="http://www.cs.cmu.edu/~zollmann/samt/">SAMT</a> grammars make use of a source- or target-side parse tree on the training data, differing in the way they extract rules using these trees: GHKM extracts synchronous tree substitution grammar rules rooted in a subset of the tree constituents, whereas SAMT projects constituent labels down onto phrases. SAMT grammars are usually many times larger and are much slower to decode with, but sometimes increase BLEU score. Both grammar formats are extracted with the <a href="thrax.html">Thrax software</a>.</p>
+
+<p>By default, the Joshua pipeline extracts a Hiero grammar, but this can be altered with the <code class="highlighter-rouge">--type (ghkm|samt)</code> flag. For GHKM grammars, the default is to use <a href="http://www-nlp.stanford.edu/~mgalley/software/stanford-ghkm-latest.tar.gz">Michel Galley's extractor</a>, but you can also use Moses' extractor with <code class="highlighter-rouge">--ghkm-extractor moses</code>. Galley's extractor only outputs two features, so the scores tend to be significantly lower than those of Moses'.</p>
+
+<h2 id="other-high-level-options">Other high-level options</h2>
+
+<p>The following command-line arguments control run-time behavior of multiple steps:</p>
+
+<ul>
+  <li>
+    <p><code class="highlighter-rouge">--threads N</code> (1)</p>
+
+    <p>This enables multithreaded operation for a number of steps: alignment (with GIZA, max two threads), parsing, and decoding (any number of threads).</p>
+  </li>
+  <li>
+    <p><code class="highlighter-rouge">--jobs N</code> (1)</p>
+
+    <p>This enables parallel operation over a cluster using the qsub command. This feature is not well-documented at this point, but you will likely want to edit the file <code class="highlighter-rouge">$JOSHUA/scripts/training/parallelize/LocalConfig.pm</code> to set up your qsub environment, and may also want to pass specific qsub arguments via the <code class="highlighter-rouge">--qsub-args "ARGS"</code> option (see the sketch after this list).</p>
+  </li>
+</ul>
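+<p>For example, the same run could be parallelized either locally or on a cluster (a sketch; the thread and job counts are arbitrary, and the qsub resource string is only an illustration that will depend on your grid):</p>
+
+<div class="highlighter-rouge"><pre class="highlight"><code># multithreaded on a single machine
+$JOSHUA/bin/pipeline.pl ... --threads 8
+
+# distributed over a cluster via qsub
+$JOSHUA/bin/pipeline.pl ... --jobs 20 --qsub-args "-l mem_free=8g"
+</code></pre>
+</div>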
+<h2 id="restarting-failed-runs">Restarting failed runs</h2>
+
+<p>If the pipeline dies, you can restart it with the same command you used the first time. If you rerun the pipeline with the exact same invocation as the previous run (or an overlapping configuration -- one that causes the same set of behaviors), you will see slightly different output compared to what we saw above:</p>
+
+<div class="highlighter-rouge"><pre class="highlight"><code>[train-copy-en] cached, skipping...
+[train-copy-ur] cached, skipping...
+...
+</code></pre>
+</div>
+
+<p>This indicates that the caching module has discovered that the step was already computed and thus did not need to be rerun. This feature is quite useful for restarting pipeline runs that have crashed due to bugs, memory limitations, hardware failures, and the myriad other problems that plague MT researchers across the world.</p>
+
+<p>Often, a command will die because it was parameterized incorrectly. For example, perhaps the decoder ran out of memory. The cached results allow you to adjust the offending parameter (e.g., <code class="highlighter-rouge">--joshua-mem</code>) and rerun the script without repeating the earlier steps. Of course, if you change one of the parameters a step depends on, it will trigger a rerun, which in turn might trigger further downstream reruns.</p>
+
+<h2 id="a-idsteps--skipping-steps-quitting-early"><a id="steps"></a> Skipping steps, quitting early</h2>
+
+<p>You will also find it useful to start the pipeline somewhere other than data preparation (for example, if you have already-processed data and an alignment, and want to begin with building a grammar) or to end it prematurely (if, say, you don't have a test set and just want to tune a model). This can be accomplished with the <code class="highlighter-rouge">--first-step</code> and <code class="highlighter-rouge">--last-step</code> flags, which take as argument a case-insensitive version of the following steps:</p>
+
+<ul>
+  <li>
+    <p><em>FIRST</em>: Data preparation. Everything begins with data preparation. This is the default first step, so there is no need to be explicit about it.</p>
+  </li>
+  <li>
+    <p><em>ALIGN</em>: Alignment. You might want to start here if you want to skip data preprocessing.</p>
+  </li>
+  <li>
+    <p><em>PARSE</em>: Parsing. This is only relevant for building SAMT and GHKM grammars (<code class="highlighter-rouge">--type samt</code> or <code class="highlighter-rouge">--type ghkm</code>), in which case the target side (<code class="highlighter-rouge">--target</code>) of the training data (<code class="highlighter-rouge">--corpus</code>) is parsed before building a grammar.</p>
+  </li>
+  <li>
+    <p><em>THRAX</em>: Grammar extraction <a href="thrax.html">with Thrax</a>. If you jump to this step, you'll need to provide an aligned corpus (<code class="highlighter-rouge">--alignment</code>) along with your parallel data.</p>
+  </li>
+  <li>
+    <p><em>TUNE</em>: Tuning. The exact tuning method is determined with <code class="highlighter-rouge">--tuner {mert,mira,pro}</code>. With this option, you need to specify a grammar (<code class="highlighter-rouge">--grammar</code>) or separate tune (<code class="highlighter-rouge">--tune-grammar</code>) and test (<code class="highlighter-rouge">--test-grammar</code>) grammars. A full grammar (<code class="highlighter-rouge">--grammar</code>) will be filtered against the relevant tuning or test set unless you specify <code class="highlighter-rouge">--no-filter-tm</code>. If you want a language model built from the target side of your training data, you'll also need to pass in the training corpus (<code class="highlighter-rouge">--corpus</code>).
You can also specify an arbitrary number of additional language models with one or more <code class="highlighter-rouge">--lmfile</code> flags.</p>
+  </li>
+  <li>
+    <p><em>TEST</em>: Testing. If you have a tuned model file, you can test new corpora by passing in a test corpus with references (<code class="highlighter-rouge">--test</code>). You'll need to provide a run name (<code class="highlighter-rouge">--name</code>) to store the results of this run, which will be placed under <code class="highlighter-rouge">test/NAME</code>. You'll also need to provide a Joshua configuration file (<code class="highlighter-rouge">--joshua-config</code>), one or more language models (<code class="highlighter-rouge">--lmfile</code>), and a grammar (<code class="highlighter-rouge">--grammar</code>); this will be filtered to the test data unless you specify <code class="highlighter-rouge">--no-filter-tm</code> or unless you directly provide a filtered test grammar (<code class="highlighter-rouge">--test-grammar</code>).</p>
+  </li>
+  <li>
+    <p><em>LAST</em>: The last step. This is the default target of <code class="highlighter-rouge">--last-step</code>.</p>
+  </li>
+</ul>
+
+<p>We now discuss these steps in more detail.</p>
+
+<h3 id="a-idprep--1-data-preparation"><a id="prep"></a> 1. DATA PREPARATION</h3>
+
+<p>Data preparation involves doing the following to each of the training data (<code class="highlighter-rouge">--corpus</code>), tuning data (<code class="highlighter-rouge">--tune</code>), and testing data (<code class="highlighter-rouge">--test</code>). Each of these values is an absolute or relative path prefix. To each of these prefixes, a "." is appended, followed by each of SOURCE (<code class="highlighter-rouge">--source</code>) and TARGET (<code class="highlighter-rouge">--target</code>), which are file extensions identifying the languages. The SOURCE and TARGET files must have the same number of lines.</p>
+
+<p>For tuning and test data, multiple references are handled automatically. A single reference will have the format TUNE.TARGET, while multiple references will have the format TUNE.TARGET.NUM, where NUM starts at 0 and increments for as many references as there are.</p>
+
+<p>The following processing steps are applied to each file.</p>
+
+<ol>
+  <li>
+    <p><strong>Copying</strong> the files into <code class="highlighter-rouge">$RUNDIR/data/TYPE</code>, where TYPE is one of "train", "tune", or "test". Multiple <code class="highlighter-rouge">--corpora</code> files are concatenated in the order they are specified. Multiple <code class="highlighter-rouge">--tune</code> and <code class="highlighter-rouge">--test</code> flags are not currently allowed.</p>
+  </li>
+  <li>
+    <p><strong>Normalizing</strong> punctuation and text (e.g., removing extra spaces, converting special quotations). There are a few language-specific options that depend on the file extension matching the <a href="http://en.wikipedia.org/wiki/List_of_ISO_639-1_codes">two-letter ISO 639-1</a> designation.</p>
+  </li>
+  <li>
+    <p><strong>Tokenizing</strong> the data (e.g., separating out punctuation, converting brackets). Again, there are language-specific tokenizations for a few languages (English, German, and Greek).</p>
+  </li>
+  <li>
+    <p>(Training only) <strong>Removing</strong> all parallel sentences with more than <code class="highlighter-rouge">--maxlen</code> tokens on either side. By default, MAXLEN is 50.
To turn this off, specify <code class="highlighter-rouge">--maxlen 0</code>.</p>
+  </li>
+  <li>
+    <p><strong>Lowercasing</strong>.</p>
+  </li>
+</ol>
+
+<p>This creates a series of intermediate files which are saved for posterity but compressed. For example, you might see</p>
+
+<div class="highlighter-rouge"><pre class="highlight"><code>data/
+    train/
+        train.en.gz
+        train.tok.en.gz
+        train.tok.50.en.gz
+        train.tok.50.lc.en
+        corpus.en -> train.tok.50.lc.en
+</code></pre>
+</div>
+
+<p>The file "corpus.LANG" is a symbolic link to the last file in the chain.</p>
+
+<h2 id="alignment-a-idalignment-">2. ALIGNMENT <a id="alignment"></a></h2>
+
+<p>Alignments are computed between the parallel corpora at <code class="highlighter-rouge">$RUNDIR/data/train/corpus.{SOURCE,TARGET}</code>. To prevent the alignment tables from getting too big, the parallel corpora are grouped into files of no more than ALIGNER_CHUNK_SIZE blocks (controlled with a parameter below). The last block is folded into the penultimate block if it is too small. These chunked files are all created in a subdirectory of <code class="highlighter-rouge">$RUNDIR/data/train/splits</code>, named <code class="highlighter-rouge">corpus.LANG.0</code>, <code class="highlighter-rouge">corpus.LANG.1</code>, and so on.</p>
+
+<p>The pipeline parameters affecting alignment are:</p>
+
+<ul>
+  <li>
+    <p><code class="highlighter-rouge">--aligner ALIGNER</code> {giza (default), berkeley, jacana}</p>
+
+    <p>Which aligner to use. The default is <a href="http://code.google.com/p/giza-pp/">GIZA++</a>, but <a href="http://code.google.com/p/berkeleyaligner/">the Berkeley aligner</a> can be used instead. When using the Berkeley aligner, you'll want to pay attention to how much memory you allocate to it with <code class="highlighter-rouge">--aligner-mem</code> (the default is 10g).</p>
+  </li>
+  <li>
+    <p><code class="highlighter-rouge">--aligner-chunk-size SIZE</code> (1,000,000)</p>
+
+    <p>The number of sentence pairs to compute alignments over. The training data is split into blocks of this size, aligned separately, and then concatenated.</p>
+  </li>
+  <li>
+    <p><code class="highlighter-rouge">--alignment FILE</code></p>
+
+    <p>If you have an already-computed alignment, you can pass that to the script using this flag. Note that, in this case, you will want to skip data preparation and alignment using <code class="highlighter-rouge">--first-step thrax</code> (the first step after alignment) and also to specify <code class="highlighter-rouge">--no-prepare</code> so as not to retokenize the data and mess with your alignments (see the sketch at the end of this section).</p>
+
+    <p>The alignment file format is the standard format where 0-indexed many-to-many alignment pairs for a sentence are provided on a line, source language first, e.g.,</p>
+
+    <p>0-0 0-1 1-2 1-7 ...</p>
+
+    <p>This value is required if you start at the grammar extraction step.</p>
+  </li>
+</ul>
+
+<p>When alignment is complete, the alignment file can be found at <code class="highlighter-rouge">$RUNDIR/alignments/training.align</code>. It is parallel to the training corpora. There are many files in the <code class="highlighter-rouge">alignments/</code> subdirectory that contain the output of intermediate steps.</p>
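+<p>For example, a run that skips data preparation and alignment and begins at grammar extraction with a precomputed alignment might look like the following (a sketch; the alignment path is a placeholder):</p>
+
+<div class="highlighter-rouge"><pre class="highlight"><code>$JOSHUA/bin/pipeline.pl \
+  --first-step thrax \
+  --no-prepare \
+  --alignment /path/to/training.align \
+  --corpus input/train \
+  --source ur \
+  --target en \
+  ...
+</code></pre>
+</div>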
+<h3 id="a-idparsing--3-parsing"><a id="parsing"></a> 3. PARSING</h3>
+
+<p>To build SAMT and GHKM grammars (<code class="highlighter-rouge">--type samt</code> and <code class="highlighter-rouge">--type ghkm</code>), the target side of the training data must be parsed. The pipeline assumes your target side will be English, and will parse it for you using <a href="http://code.google.com/p/berkeleyparser/">the Berkeley parser</a>, which is included. If English is not your target-side language, the target side of your training data (found at CORPUS.TARGET) must already be parsed in PTB format. The pipeline will notice that it is parsed and will not reparse it.</p>
+
+<p>Parsing is affected by both the <code class="highlighter-rouge">--threads N</code> and <code class="highlighter-rouge">--jobs N</code> options. The former runs the parser in multithreaded mode, while the latter distributes the runs across a cluster (and requires some configuration, not yet documented). The options are mutually exclusive.</p>
+
+<p>Once the parsing is complete, there will be two parsed files:</p>
+
+<ul>
+  <li><code class="highlighter-rouge">$RUNDIR/data/train/corpus.en.parsed</code>: this is the mixed-case file that was parsed.</li>
+  <li><code class="highlighter-rouge">$RUNDIR/data/train/corpus.parsed.en</code>: this is a leaf-lowercased version of the above file used for grammar extraction.</li>
+</ul>
+
+<h2 id="thrax-grammar-extraction-a-idtm-">4. THRAX (grammar extraction) <a id="tm"></a></h2>
+
+<p>The grammar extraction step takes three pieces of data: (1) the source-language training corpus, (2) the target-language training corpus (parsed, if an SAMT or GHKM grammar is being extracted), and (3) the alignment file. From these, it computes a synchronous context-free grammar. If you already have a grammar and wish to skip this step, you can do so by passing the grammar with the <code class="highlighter-rouge">--grammar /path/to/grammar</code> flag.</p>
+
+<p>The main variable in grammar extraction is Hadoop. If you have a Hadoop installation, simply ensure that the environment variable <code class="highlighter-rouge">$HADOOP</code> is defined, and Thrax will seamlessly use it. If you <em>do not</em> have a Hadoop installation, the pipeline will roll one out for you, running Hadoop in standalone mode (this mode is triggered when <code class="highlighter-rouge">$HADOOP</code> is undefined). Theoretically, any grammar extractable on a full Hadoop cluster should be extractable in standalone mode, if you are patient enough; in practice, you probably are not patient enough, and will be limited to smaller datasets. You may also run into problems with disk space; Hadoop uses a lot (use <code class="highlighter-rouge">--tmp /path/to/tmp</code> to specify an alternate place for temporary data; we suggest you use a local disk partition with tens or hundreds of gigabytes free, and not an NFS partition). Setting up your own Hadoop cluster is not too difficult a chore; in particular, you may find it helpful to install a <a href="http://hadoop.apache.org/common/docs/r0.20.2/quickstart.html">pseudo-distributed version of Hadoop</a>. In our experience, this works fine, but you should note the following caveats:</p>
+
+<ul>
+  <li>It is of crucial importance that you have enough physical disks. We have found that having too few disks, or disks that are too slow, results in a whole host of seemingly unrelated issues that are hard to resolve, such as timeouts.</li>
+  <li>NFS filesystems can cause lots of problems. You should really try to install physical disks that are dedicated to Hadoop scratch space.</li>
+</ul>
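+<p>As a minimal sketch, pointing the pipeline at an existing Hadoop installation and at a roomy local scratch partition might look like this (both paths are hypothetical):</p>
+
+<div class="highlighter-rouge"><pre class="highlight"><code>export HADOOP=/opt/hadoop
+$JOSHUA/bin/pipeline.pl ... --tmp /scratch/joshua-tmp
+</code></pre>
+</div>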
+<p>Here are some flags relevant to Hadoop and grammar extraction with Thrax:</p>
+
+<ul>
+  <li>
+    <p><code class="highlighter-rouge">--hadoop /path/to/hadoop</code></p>
+
+    <p>This sets the location of Hadoop (overriding the environment variable <code class="highlighter-rouge">$HADOOP</code>).</p>
+  </li>
+  <li>
+    <p><code class="highlighter-rouge">--hadoop-mem MEM</code> (2g)</p>
+
+    <p>This alters the amount of memory available to Hadoop mappers (passed via the <code class="highlighter-rouge">mapred.child.java.opts</code> option).</p>
+  </li>
+  <li>
+    <p><code class="highlighter-rouge">--thrax-conf FILE</code></p>
+
+    <p>Use the provided Thrax configuration file instead of the (grammar-specific) default. The Thrax templates are located at <code class="highlighter-rouge">$JOSHUA/scripts/training/templates/thrax-TYPE.conf</code>, where TYPE is one of "hiero" or "samt".</p>
+  </li>
+</ul>
+
+<p>When the grammar is extracted, it is compressed and placed at <code class="highlighter-rouge">$RUNDIR/grammar.gz</code>.</p>
+
+<h2 id="a-idlm--5-language-model"><a id="lm"></a> 5. Language model</h2>
+
+<p>Before tuning can take place, a language model is needed. A language model is always built from the target side of the training corpus unless <code class="highlighter-rouge">--no-corpus-lm</code> is specified. In addition, you can provide other language models (any number of them) with the <code class="highlighter-rouge">--lmfile FILE</code> argument. Other arguments are as follows.</p>
+
+<ul>
+  <li>
+    <p><code class="highlighter-rouge">--lm</code> {kenlm (default), berkeleylm}</p>
+
+    <p>This determines the language model code that will be used when decoding. These implementations are described in their respective papers (PDFs: <a href="http://kheafield.com/professional/avenue/kenlm.pdf">KenLM</a>, <a href="http://nlp.cs.berkeley.edu/pubs/Pauls-Klein_2011_LM_paper.pdf">BerkeleyLM</a>). KenLM is written in C++ and requires a pass through the JNI, but is recommended because it supports left-state minimization.</p>
+  </li>
+  <li>
+    <p><code class="highlighter-rouge">--lmfile FILE</code></p>
+
+    <p>Specifies a pre-built language model to use when decoding. This language model can be in ARPA format, or in the binary KenLM or BerkeleyLM format matching the LM implementation you are decoding with.</p>
+  </li>
+  <li>
+    <p><code class="highlighter-rouge">--lm-gen</code> {kenlm (default), srilm, berkeleylm}, <code class="highlighter-rouge">--buildlm-mem MEM</code>, <code class="highlighter-rouge">--witten-bell</code></p>
+
+    <p>At the tuning step, an LM is built from the target side of the training data (unless <code class="highlighter-rouge">--no-corpus-lm</code> is specified). This controls which code is used to build it.
The default is KenLM's <a href="http://kheafield.com/code/kenlm/estimation/">lmplz</a>, which is strongly recommended.</p>
+
+    <p>If SRILM is used, it is called with the following arguments:</p>
+
+    <div class="highlighter-rouge"><pre class="highlight"><code> $SRILM/bin/i686-m64/ngram-count -interpolate SMOOTHING -order 5 -text TRAINING-DATA -unk -lm lm.gz
+</code></pre>
+    </div>
+
+    <p>where SMOOTHING is <code class="highlighter-rouge">-kndiscount</code>, or <code class="highlighter-rouge">-wbdiscount</code> if <code class="highlighter-rouge">--witten-bell</code> is passed to the pipeline.</p>
+
+    <p>A <a href="http://code.google.com/p/berkeleylm/source/browse/trunk/src/edu/berkeley/nlp/lm/io/MakeKneserNeyArpaFromText.java">BerkeleyLM Java class</a> is also available. It computes a Kneser-Ney LM with constant discounting (0.75) and no count thresholding. The flag <code class="highlighter-rouge">--buildlm-mem</code> can be used to control how much memory is allocated to the Java process. The default is "2g", but you will want to increase it for larger language models.</p>
+
+    <p>A language model built from the target side of the training data is placed at <code class="highlighter-rouge">$RUNDIR/lm.gz</code>.</p>
+  </li>
+</ul>
+
+<h2 id="interlude-decoder-arguments">Interlude: decoder arguments</h2>
+
+<p>Running the decoder is done in both the tuning stage and the testing stage. A critical point is that you have to give the decoder enough memory to run. Joshua can be very memory-intensive, in particular when decoding with large grammars and large language models. The default amount of memory is 3100m, which is likely not enough (especially if you are decoding with a SAMT grammar). You can alter the amount of memory for Joshua using the <code class="highlighter-rouge">--joshua-mem MEM</code> argument, where MEM is a Java memory specification (passed to its <code class="highlighter-rouge">-Xmx</code> flag).</p>
+
+<h2 id="a-idtuning--6-tuning"><a id="tuning"></a> 6. TUNING</h2>
+
+<p>Two optimizers are provided with Joshua: MERT and PRO (<code class="highlighter-rouge">--tuner {mert,pro}</code>). If Moses is installed, you can also use Cherry &amp; Foster's k-best batch MIRA (<code class="highlighter-rouge">--tuner mira</code>, recommended). Tuning is run until convergence in the <code class="highlighter-rouge">$RUNDIR/tune/N</code> directory, where N is the tuning instance. By default, tuning is run just once, but the pipeline supports running the optimizer an arbitrary number of times, motivated by <a href="http://www.youtube.com/watch?v=BOa3XDkgf0Y">recent work</a> pointing out the variance of tuning procedures in machine translation, in particular MERT. This can be activated with <code class="highlighter-rouge">--optimizer-runs N</code>. Each run can be found in a directory <code class="highlighter-rouge">$RUNDIR/tune/N</code>.</p>
+
+<p>When tuning is finished, each final configuration file can be found at</p>
+
+<div class="highlighter-rouge"><pre class="highlight"><code>$RUNDIR/tune/N/joshua.config.final
+</code></pre>
+</div>
+
+<p>where N varies from 1 to <code class="highlighter-rouge">--optimizer-runs</code>.</p>
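+<p>For example, to tune with PRO and repeat the optimization three times, the relevant flags would be (a sketch; all other flags omitted):</p>
+
+<div class="highlighter-rouge"><pre class="highlight"><code>$JOSHUA/bin/pipeline.pl ... --tuner pro --optimizer-runs 3
+</code></pre>
+</div>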
+<h2 id="a-idtesting--7-testing"><a id="testing"></a> 7. Testing</h2>
+
+<p>For each of the tuner runs, Joshua takes the tuner output file and decodes the test set. If you like, you can also apply minimum Bayes-risk decoding to the decoder output with <code class="highlighter-rouge">--mbr</code>. This usually yields about 0.3-0.5 BLEU points, but is time-consuming.</p>
+
+<p>After decoding the test set with each set of tuned weights, Joshua computes the mean BLEU score, writes it to <code class="highlighter-rouge">$RUNDIR/test/final-bleu</code>, and cats it. It also writes a file <code class="highlighter-rouge">$RUNDIR/test/final-times</code> containing a summary of runtime information. That's the end of the pipeline!</p>
+
+<p>Joshua also supports decoding further test sets. This is enabled by rerunning the pipeline with a number of arguments (a complete sketch follows this list):</p>
+
+<ul>
+  <li>
+    <p><code class="highlighter-rouge">--first-step TEST</code></p>
+
+    <p>This tells the decoder to start at the test step.</p>
+  </li>
+  <li>
+    <p><code class="highlighter-rouge">--name NAME</code></p>
+
+    <p>A name is needed to distinguish this test set from the previous ones. Output for this test run will be stored at <code class="highlighter-rouge">$RUNDIR/test/NAME</code>.</p>
+  </li>
+  <li>
+    <p><code class="highlighter-rouge">--joshua-config CONFIG</code></p>
+
+    <p>A tuned parameter file is required. This file will be the output of some prior tuning run. Necessary pathnames and so on will be adjusted.</p>
+  </li>
+</ul>
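+<p>Putting these together, a hypothetical rerun on an additional test set might look like the following (the run name and test-file prefix are placeholders; the config, grammar, and LM paths are the default locations described earlier):</p>
+
+<div class="highlighter-rouge"><pre class="highlight"><code>$JOSHUA/bin/pipeline.pl \
+  --first-step TEST \
+  --name newstest \
+  --test input/newstest \
+  --joshua-config tune/1/joshua.config.final \
+  --grammar grammar.gz \
+  --lmfile lm.gz \
+  --source ur \
+  --target en
+</code></pre>
+</div>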
+<h2 id="a-idanalysis-8-analysis"><a id="analysis"></a> 8. ANALYSIS</h2>
+
+<p>If you have used the suggested layout, with a number of related runs all contained in a common directory with sequential numbers, you can use the script <code class="highlighter-rouge">$JOSHUA/scripts/training/summarize.pl</code> to display a summary of the mean BLEU scores from all runs, along with the text you placed in the run README file (using the pipeline's <code class="highlighter-rouge">--readme TEXT</code> flag).</p>
+
+<h2 id="common-use-cases-and-pitfalls">COMMON USE CASES AND PITFALLS</h2>
+
+<ul>
+  <li>
+    <p>If the pipeline dies at the "thrax-run" stage with an error like the following:</p>
+
+    <div class="highlighter-rouge"><pre class="highlight"><code>JOB FAILED (return code 1)
+hadoop/bin/hadoop: line 47:
+/some/path/to/a/directory/hadoop/bin/hadoop-config.sh: No such file or directory
+Exception in thread "main" java.lang.NoClassDefFoundError: org/apache/hadoop/fs/FsShell
+Caused by: java.lang.ClassNotFoundException: org.apache.hadoop.fs.FsShell
+</code></pre>
+    </div>
+
+    <p>This occurs if the <code class="highlighter-rouge">$HADOOP</code> environment variable is set but does not point to a working Hadoop installation. To fix it, make sure to unset the variable:</p>
+
+    <div class="highlighter-rouge"><pre class="highlight"><code># in bash
+unset HADOOP
+</code></pre>
+    </div>
+
+    <p>and then rerun the pipeline with the same invocation.</p>
+  </li>
+  <li>
+    <p>Memory usage is a major consideration in decoding with Joshua and hierarchical grammars. In particular, SAMT grammars often require a large amount of memory. Many steps have been taken to reduce memory usage, including beam settings and test-set- and sentence-level filtering of grammars. However, memory usage can still be in the tens of gigabytes.</p>
+
+    <p>To accommodate this kind of variation, the pipeline script allows you to specify both (a) the amount of memory used by the Joshua decoder instance and (b) the amount of memory required of nodes obtained by the qsub command. These are accomplished with the <code class="highlighter-rouge">--joshua-mem</code> MEM and <code class="highlighter-rouge">--qsub-args</code> ARGS options. For example,</p>
+
+    <div class="highlighter-rouge"><pre class="highlight"><code>pipeline.pl --joshua-mem 32g --qsub-args "-l pvmem=32g -q himem.q" ...
+</code></pre>
+    </div>
+
+    <p>Also, should Thrax fail, it might be due to a memory restriction. By default, Thrax requests 2 GB from the Hadoop server. If more memory is needed, set the memory requirement with the <code class="highlighter-rouge">--hadoop-mem</code> option in the same way as the <code class="highlighter-rouge">--joshua-mem</code> option is used.</p>
+  </li>
+  <li>
+    <p>Other pitfalls and advice will be added as they are discovered.</p>
+  </li>
+</ul>
+
+<h2 id="feedback">FEEDBACK</h2>
+
+<p>Please email [email protected] with problems or suggestions.</p>
+
+
+      </div>
+    </div>
+  </div> <!-- /container -->
+
+    <!-- Le javascript
+    ================================================== -->
+    <!-- Placed at the end of the document so the pages load faster -->
+    <script src="bootstrap/js/jquery.js"></script>
+    <script src="bootstrap/js/bootstrap-transition.js"></script>
+    <script src="bootstrap/js/bootstrap-alert.js"></script>
+    <script src="bootstrap/js/bootstrap-modal.js"></script>
+    <script src="bootstrap/js/bootstrap-dropdown.js"></script>
+    <script src="bootstrap/js/bootstrap-scrollspy.js"></script>
+    <script src="bootstrap/js/bootstrap-tab.js"></script>
+    <script src="bootstrap/js/bootstrap-tooltip.js"></script>
+    <script src="bootstrap/js/bootstrap-popover.js"></script>
+    <script src="bootstrap/js/bootstrap-button.js"></script>
+    <script src="bootstrap/js/bootstrap-collapse.js"></script>
+    <script src="bootstrap/js/bootstrap-carousel.js"></script>
+    <script src="bootstrap/js/bootstrap-typeahead.js"></script>
+
+    <!-- Start of StatCounter Code for Default Guide -->
+    <script type="text/javascript">
+      var sc_project=8264132;
+      var sc_invisible=1;
+      var sc_security="4b97fe2d";
+    </script>
+    <script type="text/javascript" src="http://www.statcounter.com/counter/counter.js"></script>
+    <noscript>
+      <div class="statcounter">
+        <a title="hit counter joomla" href="http://statcounter.com/joomla/" target="_blank">
+          <img class="statcounter" src="http://c.statcounter.com/8264132/0/4b97fe2d/1/" alt="hit counter joomla" />
+        </a>
+      </div>
+    </noscript>
+    <!-- End of StatCounter Code for Default Guide -->
+
+  </body>
+</html>
http://git-wip-us.apache.org/repos/asf/incubator-joshua-site/blob/53cc3005/5.0/pipeline.md ---------------------------------------------------------------------- diff --git a/5.0/pipeline.md b/5.0/pipeline.md deleted file mode 100644 index fbe052d..0000000 --- a/5.0/pipeline.md +++ /dev/null @@ -1,640 +0,0 @@
The pipeline will unroll a - standalone installation and use it to extract your grammar. This behavior will be triggered if - `$HADOOP` is undefined. - -- [SRILM](http://www.speech.sri.com/projects/srilm/) (not included) - - By default, the pipeline uses a Java program from the - [Berkeley LM](http://code.google.com/p/berkeleylm/) package that constructs an - Kneser-Ney-smoothed language model in ARPA format from the target side of your training data. If - you wish to use SRILM instead, you need to do the following: - - 1. Install SRILM and set the `$SRILM` environment variable to point to its installed location. - 1. Add the `--lm-gen srilm` flag to your pipeline invocation. - - More information on this is available in the [LM building section of the pipeline](#lm). SRILM - is not used for representing language models during decoding (and in fact is not supported, - having been supplanted by [KenLM](http://kheafield.com/code/kenlm/) (the default) and - BerkeleyLM). - -- [Moses](http://statmt.org/moses/) (not included) - -Make sure that the environment variable `$JOSHUA` is defined, and you should be all set. - -## A basic pipeline run - -The pipeline takes a set of inputs (training, tuning, and test data), and creates a set of -intermediate files in the *run directory*. By default, the run directory is the current directory, -but it can be changed with the `--rundir` parameter. - -For this quick start, we will be working with the example that can be found in -`$JOSHUA/examples/pipeline`. This example contains 1,000 sentences of Urdu-English data (the full -dataset is available as part of the -[Indian languages parallel corpora](/indian-parallel-corpora/) with -100-sentence tuning and test sets with four references each. - -Running the pipeline requires two main steps: data preparation and invocation. - -1. Prepare your data. The pipeline script needs to be told where to find the raw training, tuning, - and test data. A good convention is to place these files in an input/ subdirectory of your run's - working directory (NOTE: do not use `data/`, since a directory of that name is created and used - by the pipeline itself for storing processed files). The expected format (for each of training, - tuning, and test) is a pair of files that share a common path prefix and are distinguished by - their extension, e.g., - - input/ - train.SOURCE - train.TARGET - tune.SOURCE - tune.TARGET - test.SOURCE - test.TARGET - - These files should be parallel at the sentence level (with one sentence per line), should be in - UTF-8, and should be untokenized (tokenization occurs in the pipeline). SOURCE and TARGET denote - variables that should be replaced with the actual target and source language abbreviations (e.g., - "ur" and "en"). - -1. Run the pipeline. The following is the minimal invocation to run the complete pipeline: - - $JOSHUA/bin/pipeline.pl \ - --corpus input/train \ - --tune input/tune \ - --test input/devtest \ - --source SOURCE \ - --target TARGET - - The `--corpus`, `--tune`, and `--test` flags define file prefixes that are concatened with the - language extensions given by `--target` and `--source` (with a "." in between). Note the - correspondences with the files defined in the first step above. The prefixes can be either - absolute or relative pathnames. 
This particular invocation assumes that a subdirectory `input/` - exists in the current directory, that you are translating from a language identified "ur" - extension to a language identified by the "en" extension, that the training data can be found at - `input/train.en` and `input/train.ur`, and so on. - -*Don't* run the pipeline directly from `$JOSHUA`. We recommend creating a run directory somewhere - else to contain all of your experiments in some other location. The advantage to this (apart from - not clobbering part of the Joshua install) is that Joshua provides support scripts for visualizing - the results of a series of experiments that only work if you - -Assuming no problems arise, this command will run the complete pipeline in about 20 minutes, -producing BLEU scores at the end. As it runs, you will see output that looks like the following: - - [train-copy-en] rebuilding... - dep=/Users/post/code/joshua/test/pipeline/input/train.en - dep=data/train/train.en.gz [NOT FOUND] - cmd=cat /Users/post/code/joshua/test/pipeline/input/train.en | gzip -9n > data/train/train.en.gz - took 0 seconds (0s) - [train-copy-ur] rebuilding... - dep=/Users/post/code/joshua/test/pipeline/input/train.ur - dep=data/train/train.ur.gz [NOT FOUND] - cmd=cat /Users/post/code/joshua/test/pipeline/input/train.ur | gzip -9n > data/train/train.ur.gz - took 0 seconds (0s) - ... - -And in the current directory, you will see the following files (among other intermediate files -generated by the individual sub-steps). - - data/ - train/ - corpus.ur - corpus.en - thrax-input-file - tune/ - tune.tok.lc.ur - tune.tok.lc.en - grammar.filtered.gz - grammar.glue - test/ - test.tok.lc.ur - test.tok.lc.en - grammar.filtered.gz - grammar.glue - alignments/ - 0/ - [giza/berkeley aligner output files] - training.align - thrax-hiero.conf - thrax.log - grammar.gz - lm.gz - tune/ - 1/ - decoder_command - joshua.config - params.txt - joshua.log - mert.log - joshua.config.ZMERT.final - final-bleu - -These files will be described in more detail in subsequent sections of this tutorial. - -Another useful flag is the `--rundir DIR` flag, which chdir()s to the specified directory before -running the pipeline. By default the rundir is the current directory. Changing it can be useful -for organizing related pipeline runs. Relative paths specified to other flags (e.g., to `--corpus` -or `--lmfile`) are relative to the directory the pipeline was called *from*, not the rundir itself -(unless they happen to be the same, of course). - -The complete pipeline comprises many tens of small steps, which can be grouped together into a set -of traditional pipeline tasks: - -1. [Data preparation](#prep) -1. [Alignment](#alignment) -1. [Parsing](#parsing) (syntax-based grammars only) -1. [Grammar extraction](#tm) -1. [Language model building](#lm) -1. [Tuning](#tuning) -1. [Testing](#testing) -1. [Analysis](#analysis) - -These steps are discussed below, after a few intervening sections about high-level details of the -pipeline. - -## Managing groups of experiments - -The real utility of the pipeline comes when you use it to manage groups of experiments. Typically, -there is a held-out test set, and we want to vary a number of training parameters to determine what -effect this has on BLEU scores or some other metric. Joshua comes with a script -`$JOSHUA/scripts/training/summarize.pl` that collects information from a group of runs and reports -them to you. This script works so long as you organize your runs as follows: - -1. 
Your runs should be grouped together in a root directory, which I'll call `$RUNDIR`. - -2. For comparison purposes, the runs should all be evaluated on the same test set. - -3. Each run in the run group should be in its own numbered directory, shown with the files used by -the summarize script: - - $RUNDIR/ - 1/ - README.txt - test/ - final-bleu - final-times - [other files] - 2/ - README.txt - ... - -You can get such directories using the `--rundir N` flag to the pipeline. - -Run directories can build off each other. For example, `1/` might contain a complete baseline -run. If you wanted to just change the tuner, you don't need to rerun the aligner and model builder, -so you can reuse the results by supplying the second run with the information it needs that was -computed in step 1: - - $JOSHUA/bin/pipeline.pl \ - --first-step tune \ - --grammar 1/grammar.gz \ - ... - -More details are below. - -## Grammar options - -Joshua can extract three types of grammars: Hiero grammars, GHKM, and SAMT grammars. As described -on the [file formats page](file-formats.html), all of them are encoded into the same file format, -but they differ in terms of the richness of their nonterminal sets. - -Hiero grammars make use of a single nonterminals, and are extracted by computing phrases from -word-based alignments and then subtracting out phrase differences. More detail can be found in -[Chiang (2007) [PDF]](http://www.mitpressjournals.org/doi/abs/10.1162/coli.2007.33.2.201). -[GHKM](http://www.isi.edu/%7Emarcu/papers/cr_ghkm_naacl04.pdf) (new with 5.0) and -[SAMT](http://www.cs.cmu.edu/~zollmann/samt/) grammars make use of a source- or target-side parse -tree on the training data, differing in the way they extract rules using these trees: GHKM extracts -synchronous tree substitution grammar rules rooted in a subset of the tree constituents, whereas -SAMT projects constituent labels down onto phrases. SAMT grammars are usually many times larger and -are much slower to decode with, but sometimes increase BLEU score. Both grammar formats are -extracted with the [Thrax software](thrax.html). - -By default, the Joshua pipeline extract a Hiero grammar, but this can be altered with the `--type -(ghkm|samt)` flag. For GHKM grammars, the default is to use -[Michel Galley's extractor](http://www-nlp.stanford.edu/~mgalley/software/stanford-ghkm-latest.tar.gz), -but you can also use Moses' extractor with `--ghkm-extractor moses`. Galley's extractor only outputs -two features, so the scores tend to be significantly lower than that of Moses'. - -## Other high-level options - -The following command-line arguments control run-time behavior of multiple steps: - -- `--threads N` (1) - - This enables multithreaded operation for a number of steps: alignment (with GIZA, max two - threads), parsing, and decoding (any number of threads) - -- `--jobs N` (1) - - This enables parallel operation over a cluster using the qsub command. This feature is not - well-documented at this point, but you will likely want to edit the file - `$JOSHUA/scripts/training/parallelize/LocalConfig.pm` to setup your qsub environment, and may also - want to pass specific qsub commands via the `--qsub-args "ARGS"` command. - -## Restarting failed runs - -If the pipeline dies, you can restart it with the same command you used the first time. 
## Restarting failed runs

If the pipeline dies, you can restart it with the same command you used the first time. If you rerun the pipeline with the exact same invocation as the previous run (or an overlapping configuration -- one that causes the same set of behaviors), you will see slightly different output compared to what we saw above:

    [train-copy-en] cached, skipping...
    [train-copy-ur] cached, skipping...
    ...

This indicates that the caching module has discovered that the step was already computed and thus did not need to be rerun. This feature is quite useful for restarting pipeline runs that have crashed due to bugs, memory limitations, hardware failures, and the myriad other problems that plague MT researchers across the world.

Often, a command will die because it was parameterized incorrectly. For example, perhaps the decoder ran out of memory. Caching lets you adjust the offending parameter (e.g., `--joshua-mem`) and rerun the script without repeating the steps that already succeeded. Of course, if you change one of the parameters a step depends on, that step will be rerun, which in turn might trigger further downstream reruns.

## <a id="steps" /> Skipping steps, quitting early

You will also find it useful to start the pipeline somewhere other than data preparation (for example, if you have already-processed data and an alignment, and want to begin with building a grammar) or to end it prematurely (if, say, you don't have a test set and just want to tune a model). This can be accomplished with the `--first-step` and `--last-step` flags, which take as argument a case-insensitive version of the following steps (an example invocation follows the list):

- *FIRST*: Data preparation. Everything begins with data preparation. This is the default first step, so there is no need to be explicit about it.

- *ALIGN*: Alignment. You might want to start here if you want to skip data preprocessing.

- *PARSE*: Parsing. This is only relevant for building SAMT and GHKM grammars (`--type samt` or `--type ghkm`), in which case the target side (`--target`) of the training data (`--corpus`) is parsed before building a grammar.

- *THRAX*: Grammar extraction [with Thrax](thrax.html). If you jump to this step, you'll need to provide an aligned corpus (`--alignment`) along with your parallel data.

- *TUNE*: Tuning. The exact tuning method is determined with `--tuner {mert,mira,pro}`. With this option, you need to specify a grammar (`--grammar`) or separate tune (`--tune-grammar`) and test (`--test-grammar`) grammars. A full grammar (`--grammar`) will be filtered against the relevant tuning or test set unless you specify `--no-filter-tm`. If you want a language model built from the target side of your training data, you'll also need to pass in the training corpus (`--corpus`). You can also specify an arbitrary number of additional language models with one or more `--lmfile` flags.

- *TEST*: Testing. If you have a tuned model file, you can test new corpora by passing in a test corpus with references (`--test`). You'll need to provide a run name (`--name`) to store the results of this run, which will be placed under `test/NAME`. You'll also need to provide a Joshua configuration file (`--joshua-config`), one or more language models (`--lmfile`), and a grammar (`--grammar`); this will be filtered to the test data unless you specify `--no-filter-tm` or unless you directly provide a filtered test grammar (`--test-grammar`).

- *LAST*: The last step. This is the default target of `--last-step`.
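As an illustration, the following sketch (all paths are placeholders) jumps in after alignment and stops once tuning is done, which is useful when you already have an alignment but no test set yet:

    $JOSHUA/bin/pipeline.pl \
      --first-step thrax --last-step tune \
      --no-prepare \
      --corpus input/train --source ur --target en \
      --tune input/tune \
      --alignment alignments/training.align \
      ...

Passing `--no-prepare` keeps the pipeline from retokenizing the data, which would otherwise invalidate the supplied alignment.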
We now discuss these steps in more detail.

### <a id="prep" /> 1. DATA PREPARATION

Data preparation involves applying the following steps to each of the training data (`--corpus`), tuning data (`--tune`), and testing data (`--test`). Each of these values is an absolute or relative path prefix. To each of these prefixes, a "." is appended, followed by each of SOURCE (`--source`) and TARGET (`--target`), which are file extensions identifying the languages. The SOURCE and TARGET files must have the same number of lines.

For tuning and test data, multiple references are handled automatically. A single reference will have the format TUNE.TARGET, while multiple references will have the format TUNE.TARGET.NUM, where NUM starts at 0 and increments for as many references as there are.

The following processing steps are applied to each file.

1. **Copying** the files into `$RUNDIR/data/TYPE`, where TYPE is one of "train", "tune", or "test". Multiple `--corpora` files are concatenated in the order they are specified. Multiple `--tune` and `--test` flags are not currently allowed.

1. **Normalizing** punctuation and text (e.g., removing extra spaces, converting special quotations). There are a few language-specific options that depend on the file extension matching the [two-letter ISO 639-1](http://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) designation.

1. **Tokenizing** the data (e.g., separating out punctuation, converting brackets). Again, there are language-specific tokenizations for a few languages (English, German, and Greek).

1. (Training only) **Removing** all parallel sentences with more than `--maxlen` tokens on either side. By default, MAXLEN is 50. To turn this off, specify `--maxlen 0`.

1. **Lowercasing**.

This creates a series of intermediate files, which are saved for posterity but compressed. For example, you might see

    data/
        train/
            train.en.gz
            train.tok.en.gz
            train.tok.50.en.gz
            train.tok.50.lc.en
            corpus.en -> train.tok.50.lc.en

The file "corpus.LANG" is a symbolic link to the last file in the chain.

## 2. ALIGNMENT <a id="alignment" />

Alignments are computed between the parallel corpora at `$RUNDIR/data/train/corpus.{SOURCE,TARGET}`. To prevent the alignment tables from getting too big, the parallel corpora are split into blocks of no more than ALIGNER\_CHUNK\_SIZE sentence pairs (controlled with a parameter below). The last block is folded into the penultimate block if it is too small. These chunked files are all created in a subdirectory of `$RUNDIR/data/train/splits`, named `corpus.LANG.0`, `corpus.LANG.1`, and so on.

The pipeline parameters affecting alignment are:

- `--aligner ALIGNER` {giza (default), berkeley, jacana}

  Which aligner to use. The default is [GIZA++](http://code.google.com/p/giza-pp/), but [the Berkeley aligner](http://code.google.com/p/berkeleyaligner/) can be used instead. When using the Berkeley aligner, you'll want to pay attention to how much memory you allocate to it with `--aligner-mem` (the default is 10g).

- `--aligner-chunk-size SIZE` (1,000,000)

  The number of sentence pairs to compute alignments over. The training data is split into blocks of this size, aligned separately, and then concatenated.

- `--alignment FILE`

  If you have an already-computed alignment, you can pass that to the script using this flag.
  Note that, in this case, you will want to skip data preparation and alignment using `--first-step thrax` (the first step after alignment) and also to specify `--no-prepare` so as not to retokenize the data and mess with your alignments.

  The alignment file format is the standard format, where the 0-indexed many-to-many alignment pairs for a sentence are provided on a single line, source language first, e.g.,

      0-0 0-1 1-2 1-7 ...

  This value is required if you start at the grammar extraction step.

When alignment is complete, the alignment file can be found at `$RUNDIR/alignments/training.align`. It is parallel to the training corpora. There are many files in the `alignments/` subdirectory that contain the output of intermediate steps.

### <a id="parsing" /> 3. PARSING

To build SAMT and GHKM grammars (`--type samt` and `--type ghkm`), the target side of the training data must be parsed. The pipeline assumes your target side will be English, and will parse it for you using [the Berkeley parser](http://code.google.com/p/berkeleyparser/), which is included. If your target language is not English, the target side of your training data (found at CORPUS.TARGET) must already be parsed in PTB format. The pipeline will notice that it is parsed and will not reparse it.

Parsing is affected by both the `--threads N` and `--jobs N` options. The former runs the parser in multithreaded mode, while the latter distributes the runs across a cluster (and requires some configuration, not yet documented). The options are mutually exclusive.

Once the parsing is complete, there will be two parsed files:

- `$RUNDIR/data/train/corpus.en.parsed`: this is the mixed-case file that was parsed.
- `$RUNDIR/data/train/corpus.parsed.en`: this is a leaf-lowercased version of the above file used for grammar extraction.

## 4. THRAX (grammar extraction) <a id="tm" />

The grammar extraction step takes three pieces of data: (1) the source-language training corpus, (2) the target-language training corpus (parsed, if a SAMT or GHKM grammar is being extracted), and (3) the alignment file. From these, it computes a synchronous context-free grammar. If you already have a grammar and wish to skip this step, you can do so by passing the grammar with the `--grammar /path/to/grammar` flag.

The main variable in grammar extraction is Hadoop. If you have a Hadoop installation, simply ensure that the environment variable `$HADOOP` is defined, and Thrax will seamlessly use it. If you *do not* have a Hadoop installation, the pipeline will roll one out for you, running Hadoop in standalone mode (this mode is triggered when `$HADOOP` is undefined). Theoretically, any grammar extractable on a full Hadoop cluster should be extractable in standalone mode, if you are patient enough; in practice, you probably are not patient enough, and will be limited to smaller datasets. You may also run into problems with disk space; Hadoop uses a lot (use `--tmp /path/to/tmp` to specify an alternate place for temporary data; we suggest you use a local disk partition with tens or hundreds of gigabytes free, and not an NFS partition). Setting up your own Hadoop cluster is not too difficult a chore; in particular, you may find it helpful to install a [pseudo-distributed version of Hadoop](http://hadoop.apache.org/common/docs/r0.20.2/quickstart.html).
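Pointing the pipeline at an existing (or pseudo-distributed) Hadoop installation looks roughly like the following sketch; the installation and scratch paths are placeholders:

    # either define the environment variable before invoking the pipeline ...
    export HADOOP=/path/to/hadoop
    $JOSHUA/bin/pipeline.pl ... --tmp /local/scratch/joshua-tmp ...

    # ... or pass the location explicitly (this overrides $HADOOP)
    $JOSHUA/bin/pipeline.pl ... --hadoop /path/to/hadoop --hadoop-mem 4g ...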
In our experience, a pseudo-distributed setup works fine, but you should note the following caveats:

- It is of crucial importance that you have enough physical disks. We have found that having too few disks, or disks that are too slow, results in a whole host of seemingly unrelated issues that are hard to resolve, such as timeouts.
- NFS filesystems can cause lots of problems. You should really try to install physical disks that are dedicated to Hadoop scratch space.

Here are some flags relevant to Hadoop and grammar extraction with Thrax:

- `--hadoop /path/to/hadoop`

  This sets the location of Hadoop (overriding the environment variable `$HADOOP`).

- `--hadoop-mem MEM` (2g)

  This alters the amount of memory available to Hadoop mappers (passed via the `mapred.child.java.opts` option).

- `--thrax-conf FILE`

  Use the provided Thrax configuration file instead of the (grammar-specific) default. The Thrax templates are located at `$JOSHUA/scripts/training/templates/thrax-TYPE.conf`, where TYPE is one of "hiero" or "samt".

When the grammar is extracted, it is compressed and placed at `$RUNDIR/grammar.gz`.

## <a id="lm" /> 5. Language model

Before tuning can take place, a language model is needed. A language model is always built from the target side of the training corpus unless `--no-corpus-lm` is specified. In addition, you can provide other language models (any number of them) with the `--lmfile FILE` argument. Other arguments are as follows.

- `--lm` {kenlm (default), berkeleylm}

  This determines the language model code that will be used when decoding. These implementations are described in their respective papers (PDFs: [KenLM](http://kheafield.com/professional/avenue/kenlm.pdf), [BerkeleyLM](http://nlp.cs.berkeley.edu/pubs/Pauls-Klein_2011_LM_paper.pdf)). KenLM is written in C++ and requires a pass through the JNI, but is recommended because it supports left-state minimization.

- `--lmfile FILE`

  Specifies a pre-built language model to use when decoding. This language model can be in ARPA format, or in the binary format of whichever implementation you selected with `--lm` (KenLM or BerkeleyLM).

- `--lm-gen` {kenlm (default), srilm, berkeleylm}, `--buildlm-mem MEM`, `--witten-bell`

  At the tuning step, an LM is built from the target side of the training data (unless `--no-corpus-lm` is specified). This controls which code is used to build it. The default is KenLM's [lmplz](http://kheafield.com/code/kenlm/estimation/), which is strongly recommended.

  If SRILM is used, it is called with the following arguments:

      $SRILM/bin/i686-m64/ngram-count -interpolate SMOOTHING -order 5 -text TRAINING-DATA -unk -lm lm.gz

  where SMOOTHING is `-kndiscount`, or `-wbdiscount` if `--witten-bell` is passed to the pipeline.

  A [BerkeleyLM Java class](http://code.google.com/p/berkeleylm/source/browse/trunk/src/edu/berkeley/nlp/lm/io/MakeKneserNeyArpaFromText.java) is also available. It computes a Kneser-Ney LM with constant discounting (0.75) and no count thresholding. The flag `--buildlm-mem` can be used to control how much memory is allocated to the Java process. The default is "2g", but you will want to increase it for larger language models.

A language model built from the target side of the training data is placed at `$RUNDIR/lm.gz`.

## Interlude: decoder arguments

The decoder is run in both the tuning stage and the testing stage. A critical point is that you have to give the decoder enough memory to run: Joshua can be very memory-intensive, in particular when decoding with large grammars and large language models. The default amount of memory is 3100m, which is likely not enough (especially if you are decoding with a SAMT grammar). You can alter the amount of memory for Joshua using the `--joshua-mem MEM` argument, where MEM is a Java memory specification (passed to Java's `-Xmx` flag).
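For example, the following sketch (the language model path is a placeholder) adds a pre-built LM alongside the one estimated from the training data and gives the decoder extra memory:

    $JOSHUA/bin/pipeline.pl \
      --corpus input/train --tune input/tune --test input/devtest \
      --source ur --target en \
      --lm kenlm \
      --lmfile /path/to/big-monolingual.lm.gz \
      --joshua-mem 8g \
      ...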
## <a id="tuning" /> 6. TUNING

Two optimizers are provided with Joshua: MERT and PRO (`--tuner {mert,pro}`). If Moses is installed, you can also use Cherry & Foster's k-best batch MIRA (`--tuner mira`, recommended). Tuning is run until convergence in the `$RUNDIR/tune/N` directory, where N is the tuning instance. By default, tuning is run just once, but the pipeline supports running the optimizer an arbitrary number of times, due to [recent work](http://www.youtube.com/watch?v=BOa3XDkgf0Y) pointing out the variance of tuning procedures in machine translation, in particular MERT. This can be activated with `--optimizer-runs N`. Each run can be found in a directory `$RUNDIR/tune/N`.

When tuning is finished, each final configuration file can be found at

    $RUNDIR/tune/N/joshua.config.final

where N varies from 1 to `--optimizer-runs`.

## <a id="testing" /> 7. Testing

For each of the tuner runs, Joshua takes the tuner output file and decodes the test set. If you like, you can also apply minimum Bayes-risk decoding to the decoder output with `--mbr`. This usually yields about 0.3-0.5 BLEU points, but is time-consuming.

After decoding the test set with each set of tuned weights, Joshua computes the mean BLEU score, writes it to `$RUNDIR/test/final-bleu`, and cats it. It also writes a file `$RUNDIR/test/final-times` containing a summary of runtime information. That's the end of the pipeline!

Joshua also supports decoding further test sets. This is enabled by rerunning the pipeline with a number of arguments:

- `--first-step TEST`

  This tells the decoder to start at the test step.

- `--name NAME`

  A name is needed to distinguish this test set from the previous ones. Output for this test run will be stored at `$RUNDIR/test/NAME`.

- `--joshua-config CONFIG`

  A tuned parameter file is required. This file will be the output of some prior tuning run. Necessary pathnames and so on will be adjusted.

## <a id="analysis" /> 8. ANALYSIS

If you have used the suggested layout, with a number of related runs all contained in a common directory with sequential numbers, you can use the script `$JOSHUA/scripts/training/summarize.pl` to display a summary of the mean BLEU scores from all runs, along with the text you placed in the run README file (using the pipeline's `--readme TEXT` flag).

## COMMON USE CASES AND PITFALLS

- If the pipeline dies at the "thrax-run" stage with an error like the following:

      JOB FAILED (return code 1)
      hadoop/bin/hadoop: line 47:
      /some/path/to/a/directory/hadoop/bin/hadoop-config.sh: No such file or directory
      Exception in thread "main" java.lang.NoClassDefFoundError: org/apache/hadoop/fs/FsShell
      Caused by: java.lang.ClassNotFoundException: org.apache.hadoop.fs.FsShell

  This occurs if the `$HADOOP` environment variable is set but does not point to a working Hadoop installation. To fix it, make sure to unset the variable:

      # in bash
      unset HADOOP

  and then rerun the pipeline with the same invocation.

- Memory usage is a major consideration in decoding with Joshua and hierarchical grammars. In particular, SAMT grammars often require a large amount of memory.
  Many steps have been taken to reduce memory usage, including beam settings and test-set- and sentence-level filtering of grammars. However, memory usage can still be in the tens of gigabytes.

  To accommodate this kind of variation, the pipeline script allows you to specify both (a) the amount of memory used by the Joshua decoder instance and (b) the amount of memory required of nodes obtained by the qsub command. These are set with the `--joshua-mem MEM` and `--qsub-args ARGS` options. For example,

      pipeline.pl --joshua-mem 32g --qsub-args "-l pvmem=32g -q himem.q" ...

  Also, should Thrax fail, it might be due to a memory restriction. By default, Thrax requests 2 GB from the Hadoop server. If more memory is needed, set the memory requirement with `--hadoop-mem`, in the same way the `--joshua-mem` option is used.

- Other pitfalls and advice will be added as they are discovered.

## FEEDBACK

Please email [email protected] with problems or suggestions.

http://git-wip-us.apache.org/repos/asf/incubator-joshua-site/blob/53cc3005/5.0/server.html ---------------------------------------------------------------------- diff --git a/5.0/server.html b/5.0/server.html new file mode 100644 index 0000000..661a8bc --- /dev/null +++ b/5.0/server.html @@ -0,0 +1,196 @@ +<!DOCTYPE html> +<html lang="en"> + <head> + <meta charset="utf-8"> + <title>Joshua Documentation | Server mode</title> + <meta name="viewport" content="width=device-width, initial-scale=1.0"> + <meta name="description" content=""> + <meta name="author" content=""> + + <!-- Le styles --> + <link href="/bootstrap/css/bootstrap.css" rel="stylesheet"> + <style> + body { + padding-top: 60px; /* 60px to make the container go all the way to the bottom of the topbar */ + } + #download { + background-color: green; + font-size: 14pt; + font-weight: bold; + text-align: center; + color: white; + border-radius: 5px; + padding: 4px; + } + + #download a:link { + color: white; + } + + #download a:hover { + color: lightgrey; + } + + #download a:visited { + color: white; + } + + a.pdf { + font-variant: small-caps; + /* font-weight: bold; */ + font-size: 10pt; + color: white; + background: brown; + padding: 2px; + } + + a.bibtex { + font-variant: small-caps; + /* font-weight: bold; */ + font-size: 10pt; + color: white; + background: orange; + padding: 2px; + } + + img.sponsor { + height: 120px; + margin: 5px; + } + </style> + <link href="bootstrap/css/bootstrap-responsive.css" rel="stylesheet"> + + <!-- HTML5 shim, for IE6-8 support of HTML5 elements --> + <!--[if lt IE 9]> + <script src="bootstrap/js/html5shiv.js"></script> + <![endif]--> + + <!-- Fav and touch icons --> + <link rel="apple-touch-icon-precomposed" sizes="144x144" href="bootstrap/ico/apple-touch-icon-144-precomposed.png"> + <link rel="apple-touch-icon-precomposed" sizes="114x114" href="bootstrap/ico/apple-touch-icon-114-precomposed.png"> + <link rel="apple-touch-icon-precomposed" sizes="72x72" href="bootstrap/ico/apple-touch-icon-72-precomposed.png"> + <link rel="apple-touch-icon-precomposed" href="bootstrap/ico/apple-touch-icon-57-precomposed.png"> + <link rel="shortcut icon" href="bootstrap/ico/favicon.png"> + </head> + + <body> + + <div class="navbar navbar-inverse navbar-fixed-top"> + <div class="navbar-inner"> + <div class="container"> + <button type="button" class="btn btn-navbar" data-toggle="collapse" data-target=".nav-collapse"> + <span class="icon-bar"></span> + <span class="icon-bar"></span> + <span class="icon-bar"></span> + </button> +
<a class="brand" href="/">Joshua</a> + <div class="nav-collapse collapse"> + <ul class="nav"> + <li><a href="index.html">Documentation</a></li> + <li><a href="pipeline.html">Pipeline</a></li> + <li><a href="tutorial.html">Tutorial</a></li> + <li><a href="decoder.html">Decoder</a></li> + <li><a href="thrax.html">Thrax</a></li> + <li><a href="file-formats.html">File formats</a></li> + <!-- <li><a href="advanced.html">Advanced</a></li> --> + <li><a href="faq.html">FAQ</a></li> + </ul> + </div><!--/.nav-collapse --> + </div> + </div> + </div> + + <div class="container"> + + <div class="row"> + <div class="span2"> + <img src="/images/joshua-logo-small.png" + alt="Joshua logo (picture of a Joshua tree)" /> + </div> + <div class="span10"> + <h1>Joshua Documentation</h1> + <h2>Server mode</h2> + <span id="download"> + <a href="http://cs.jhu.edu/~post/files/joshua-v5.0.tgz">Download</a> + </span> + (version 5.0, released 16 August 2013) + </div> + </div> + + <hr /> + + <div class="row"> + <div class="span8"> + + <p>The Joshua decoder can be run as a TCP/IP server instead of a POSIX-style command-line tool. Clients can concurrently connect to a socket and receive a set of newline-separated outputs for a set of newline-separated inputs.</p> + +<p>Threading takes place both within and across requests. Threads from the decoder pool are assigned in round-robin manner across requests, preventing starvation.</p> + +<h1 id="invoking-the-server">Invoking the server</h1> + +<p>A running server is configured at invocation time. To start in server mode, run <code class="highlighter-rouge">joshua-decoder</code> with the option <code class="highlighter-rouge">-server-port [PORT]</code>. Additionally, the server can be configured in the same way as when using the command-line functionality.</p> + +<p>E.g.,</p> + +<div class="highlighter-rouge"><pre class="highlight"><code>$JOSHUA/bin/joshua-decoder -server-port 10101 -mark-oovs false -output-format "%s" -threads 10 +</code></pre> +</div> + +<h2 id="using-the-server">Using the server</h2> + +<p>To test that the server is working, a set of inputs can be sent to the server from the command line. </p> + +<p>The server, as configured in the example above, will then respond to requests on port 10101. 
You can test it out with the <code class="highlighter-rouge">nc</code> utility:</p> + +<div class="highlighter-rouge"><pre class="highlight"><code>wget -qO - http://cs.jhu.edu/~post/files/pg1023.txt | head -132 | tail -11 | nc localhost 10101 +</code></pre> +</div> + +<p>Since no model was loaded, this will just return the text to you as sent to the server.</p> + +<p>The <code class="highlighter-rouge">-server-port</code> option can also be used when creating a <a href="bundle.html">bundled configuration</a> that will be run in server mode.</p> + + + </div> + </div> + </div> <!-- /container --> + + <!-- Le javascript + ================================================== --> + <!-- Placed at the end of the document so the pages load faster --> + <script src="bootstrap/js/jquery.js"></script> + <script src="bootstrap/js/bootstrap-transition.js"></script> + <script src="bootstrap/js/bootstrap-alert.js"></script> + <script src="bootstrap/js/bootstrap-modal.js"></script> + <script src="bootstrap/js/bootstrap-dropdown.js"></script> + <script src="bootstrap/js/bootstrap-scrollspy.js"></script> + <script src="bootstrap/js/bootstrap-tab.js"></script> + <script src="bootstrap/js/bootstrap-tooltip.js"></script> + <script src="bootstrap/js/bootstrap-popover.js"></script> + <script src="bootstrap/js/bootstrap-button.js"></script> + <script src="bootstrap/js/bootstrap-collapse.js"></script> + <script src="bootstrap/js/bootstrap-carousel.js"></script> + <script src="bootstrap/js/bootstrap-typeahead.js"></script> + + <!-- Start of StatCounter Code for Default Guide --> + <script type="text/javascript"> + var sc_project=8264132; + var sc_invisible=1; + var sc_security="4b97fe2d"; + </script> + <script type="text/javascript" src="http://www.statcounter.com/counter/counter.js"></script> + <noscript> + <div class="statcounter"> + <a title="hit counter joomla" + href="http://statcounter.com/joomla/" + target="_blank"> + <img class="statcounter" + src="http://c.statcounter.com/8264132/0/4b97fe2d/1/" + alt="hit counter joomla" /> + </a> + </div> + </noscript> + <!-- End of StatCounter Code for Default Guide --> + + </body> +</html>
