http://git-wip-us.apache.org/repos/asf/incubator-joshua-site/blob/53cc3005/4.0/thrax.html ---------------------------------------------------------------------- diff --git a/4.0/thrax.html b/4.0/thrax.html new file mode 100644 index 0000000..87ec518 --- /dev/null +++ b/4.0/thrax.html @@ -0,0 +1,264 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> + +<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> + <head> + <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" /> + <link rel="stylesheet" type="text/css" media="screen,print" href="../joshua4.css" /> + <title>Joshua | Grammar extraction with Thrax</title> + </head> + + <body> + + <div id="navbar"> + <a href="http://joshua-decoder.org/"> + <img src="../images/joshua-logo-small.png" width="130px" + alt="Joshua logo (picture of a Joshua tree)" /> + </a> + + <p class="infobox"> + <b>Stable version</b><br /> + 4.1<br/><br/> + <b>Release date</b><br /> + 2013 January + </p> + +<!-- <div class="infobox"> --> +<!-- <b>AUTO LINKS</b><br/> --> +<!-- <ul> --> +<!-- --> +<!-- <li> Advanced features</li> --> +<!-- --> +<!-- <li> Advanced features</li> --> +<!-- --> +<!-- <li> Advanced features</li> --> +<!-- --> +<!-- <li> Building a language pack</li> --> +<!-- --> +<!-- <li> Building a language pack</li> --> +<!-- --> +<!-- <li> Bundling a configuration</li> --> +<!-- --> +<!-- <li> Contributors</li> --> +<!-- --> +<!-- <li> Decoder configuration parameters</li> --> +<!-- --> +<!-- <li> Decoder configuration parameters</li> --> +<!-- --> +<!-- <li> Decoder configuration parameters</li> --> +<!-- --> +<!-- <li> Decoder configuration parameters</li> --> +<!-- --> +<!-- <li> Frequently Asked Questions</li> --> +<!-- --> +<!-- <li> Common problems</li> --> +<!-- --> +<!-- <li> Frequently Asked Questions</li> --> +<!-- --> +<!-- <li> Common problems</li> --> +<!-- --> +<!-- <li> Features</li> --> 
+<!-- --> +<!-- <li> Features</li> --> +<!-- --> +<!-- <li> Features</li> --> +<!-- --> +<!-- <li> Features</li> --> +<!-- --> +<!-- <li> Joshua file formats</li> --> +<!-- --> +<!-- <li> Joshua file formats</li> --> +<!-- --> +<!-- <li> Joshua file formats</li> --> +<!-- --> +<!-- <li> Joshua file formats</li> --> +<!-- --> +<!-- <li> </li> --> +<!-- --> +<!-- <li> </li> --> +<!-- --> +<!-- <li> </li> --> +<!-- --> +<!-- <li> Fisher and CALLHOME Spanish English Speech Translation Corpus</li> --> +<!-- --> +<!-- <li> Indian Languages Parallel Corpora</li> --> +<!-- --> +<!-- <li> Joshua 4.0 User Documentation</li> --> +<!-- --> +<!-- <li> Language packs</li> --> +<!-- --> +<!-- <li> Paraphrase Packs</li> --> +<!-- --> +<!-- <li> Joshua releases</li> --> +<!-- --> +<!-- <li> Support</li> --> +<!-- --> +<!-- <li> Getting Started</li> --> +<!-- --> +<!-- <li> Welcome to Joshua</li> --> +<!-- --> +<!-- <li> Joshua documentation</li> --> +<!-- --> +<!-- <li> Joshua documentation</li> --> +<!-- --> +<!-- <li> Installation</li> --> +<!-- --> +<!-- <li> Installation</li> --> +<!-- --> +<!-- <li> Alignment with Jacana</li> --> +<!-- --> +<!-- <li> Alignment with Jacana</li> --> +<!-- --> +<!-- <li> Alignment with Jacana</li> --> +<!-- --> +<!-- <li> Building large LMs with SRILM</li> --> +<!-- --> +<!-- <li> Building large LMs with SRILM</li> --> +<!-- --> +<!-- <li> Building large LMs with SRILM</li> --> +<!-- --> +<!-- <li> Building large LMs with SRILM</li> --> +<!-- --> +<!-- <li> Lattice decoding</li> --> +<!-- --> +<!-- <li> Grammar Packing</li> --> +<!-- --> +<!-- <li> Grammar Packing</li> --> +<!-- --> +<!-- <li> Grammar Packing</li> --> +<!-- --> +<!-- <li> Grammar Packing</li> --> +<!-- --> +<!-- <li> The Joshua Pipeline</li> --> +<!-- --> +<!-- <li> The Joshua Pipeline</li> --> +<!-- --> +<!-- <li> The Joshua Pipeline</li> --> +<!-- --> +<!-- <li> The Joshua Pipeline</li> --> +<!-- --> +<!-- <li> Quick Start</li> --> +<!-- --> +<!-- <li> Quick Start</li> --> 
+<!-- --> +<!-- <li> Releases</li> --> +<!-- --> +<!-- <li> Server mode</li> --> +<!-- --> +<!-- <li> Server mode</li> --> +<!-- --> +<!-- <li> Server mode</li> --> +<!-- --> +<!-- <li> Installing and running the Joshua Decoder</li> --> +<!-- --> +<!-- <li> Grammar extraction with Thrax</li> --> +<!-- --> +<!-- <li> Grammar extraction with Thrax</li> --> +<!-- --> +<!-- <li> Grammar extraction with Thrax</li> --> +<!-- --> +<!-- <li> Grammar extraction with Thrax</li> --> +<!-- --> +<!-- <li> Building Translation Models</li> --> +<!-- --> +<!-- <li> Building Translation Models</li> --> +<!-- --> +<!-- <li> Building Translation Models</li> --> +<!-- --> +<!-- <li> Building Translation Models</li> --> +<!-- --> +<!-- <li> Pipeline tutorial</li> --> +<!-- --> +<!-- <li> Pipeline tutorial</li> --> +<!-- --> +<!-- <li> Pipeline tutorial</li> --> +<!-- --> +<!-- <li> What's New</li> --> +<!-- --> +<!-- <li> What's New</li> --> +<!-- --> +<!-- <li> Z-MERT</li> --> +<!-- --> +<!-- <li> Z-MERT</li> --> +<!-- --> +<!-- <li> Z-MERT</li> --> +<!-- --> +<!-- <li> Z-MERT</li> --> +<!-- --> +<!-- <li> </li> --> +<!-- --> +<!-- <li> </li> --> +<!-- --> +<!-- <li> </li> --> +<!-- --> +<!-- </ul> --> +<!-- </div> --> + + <div class="infobox"> + + <b>Links</b><br /> + <ul> + <li> <a href="../index.html">Main</a> </li> + <li> <a href="pipeline.html">Pipeline</a> </li> + <li> <a href="step-by-step-instructions.html">Manual walkthrough</a> </li> + <li> <a href="decoder.html">Decoder</a> </li> + <li> <a href="server.html">Decoder Server</a> </li> + <li> <a href="file-formats.html">File formats</a> </li> + <li> <a href="thrax.html">Grammar Extraction</a> </li> + <li> <a href="../releases.html">Releases</a> </li> + </ul> + </div> + + <div class="infobox"> + <b>Advanced</b><br /> + <ul> +<!-- <li> <a href="packing.html">Grammar packing</a> </li> --> + <li> <a href="large-lms.html">Building large LMs</a> </li> + <li> <a href="zmert.html">Running Z-MERT</a> </li> + <li> <a 
href="lattice.html">Lattices</a> </li> + <li> <a href="server.html">TCP/IP server</a> </li> + <li> <a href="bundle.html">Bundled configuration</a> </li> + </ul> + </div> + + <div class="infobox"> + <b>Help</b><br /> + <ul> + <li> <a href="faq.html">Answers</a> </li> + <li> <a href="https://groups.google.com/d/forum/joshua_support">Archive</a> </li> + </ul> + </div> + + <div class="footer"> + Last updated on April 08, 2016 + </div> + + </div> + + <div id="main"> + <div id="title"> + <h1>Grammar extraction with Thrax</h1> + </div> + + <div id="content"> + + <p>One day, this will hold Thrax documentation, including how to use Thrax, how to do grammar +filtering, and details on the configuration file options. It will also include details about our +experience setting up and maintaining Hadoop cluster installations, knowledge wrought of hard-fought +sweat and tears.</p> + +<p>In the meantime, please bother <a href="http://cs.jhu.edu/~jonny/">Jonny Weese</a> if there is something you +need to do that you don’t understand. You might also be able to dig up some information <a href="http://cs.jhu.edu/~jonny/thrax/">on the old +Thrax page</a>.</p> + + + </div> + </div> + + </body> +</html> + + + + +
http://git-wip-us.apache.org/repos/asf/incubator-joshua-site/blob/53cc3005/4.0/thrax.md ---------------------------------------------------------------------- diff --git a/4.0/thrax.md b/4.0/thrax.md deleted file mode 100644 index 6b276b0..0000000 --- a/4.0/thrax.md +++ /dev/null @@ -1,14 +0,0 @@ ---- -layout: default4 -category: advanced -title: Grammar extraction with Thrax ---- - -One day, this will hold Thrax documentation, including how to use Thrax, how to do grammar -filtering, and details on the configuration file options. It will also include details about our -experience setting up and maintaining Hadoop cluster installations, knowledge wrought of hard-fought -sweat and tears. - -In the meantime, please bother [Jonny Weese](http://cs.jhu.edu/~jonny/) if there is something you -need to do that you don't understand. You might also be able to dig up some information [on the old -Thrax page](http://cs.jhu.edu/~jonny/thrax/). http://git-wip-us.apache.org/repos/asf/incubator-joshua-site/blob/53cc3005/4.0/tms.html ---------------------------------------------------------------------- diff --git a/4.0/tms.html b/4.0/tms.html new file mode 100644 index 0000000..1e38df8 --- /dev/null +++ b/4.0/tms.html @@ -0,0 +1,377 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> + +<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> + <head> + <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" /> + <link rel="stylesheet" type="text/css" media="screen,print" href="../joshua4.css" /> + <title>Joshua | Building Translation Models</title> + </head> + + <body> + + <div id="navbar"> + <a href="http://joshua-decoder.org/"> + <img src="../images/joshua-logo-small.png" width="130px" + alt="Joshua logo (picture of a Joshua tree)" /> + </a> + + <p class="infobox"> + <b>Stable version</b><br /> + 4.1<br/><br/> + <b>Release date</b><br /> + 2013 January + 
</p> + +<!-- <div class="infobox"> --> +<!-- <b>AUTO LINKS</b><br/> --> +<!-- <ul> --> +<!-- --> +<!-- <li> Advanced features</li> --> +<!-- --> +<!-- <li> Advanced features</li> --> +<!-- --> +<!-- <li> Advanced features</li> --> +<!-- --> +<!-- <li> Building a language pack</li> --> +<!-- --> +<!-- <li> Building a language pack</li> --> +<!-- --> +<!-- <li> Bundling a configuration</li> --> +<!-- --> +<!-- <li> Contributors</li> --> +<!-- --> +<!-- <li> Decoder configuration parameters</li> --> +<!-- --> +<!-- <li> Decoder configuration parameters</li> --> +<!-- --> +<!-- <li> Decoder configuration parameters</li> --> +<!-- --> +<!-- <li> Decoder configuration parameters</li> --> +<!-- --> +<!-- <li> Frequently Asked Questions</li> --> +<!-- --> +<!-- <li> Common problems</li> --> +<!-- --> +<!-- <li> Frequently Asked Questions</li> --> +<!-- --> +<!-- <li> Common problems</li> --> +<!-- --> +<!-- <li> Features</li> --> +<!-- --> +<!-- <li> Features</li> --> +<!-- --> +<!-- <li> Features</li> --> +<!-- --> +<!-- <li> Features</li> --> +<!-- --> +<!-- <li> Joshua file formats</li> --> +<!-- --> +<!-- <li> Joshua file formats</li> --> +<!-- --> +<!-- <li> Joshua file formats</li> --> +<!-- --> +<!-- <li> Joshua file formats</li> --> +<!-- --> +<!-- <li> </li> --> +<!-- --> +<!-- <li> </li> --> +<!-- --> +<!-- <li> </li> --> +<!-- --> +<!-- <li> Fisher and CALLHOME Spanish English Speech Translation Corpus</li> --> +<!-- --> +<!-- <li> Indian Languages Parallel Corpora</li> --> +<!-- --> +<!-- <li> Joshua 4.0 User Documentation</li> --> +<!-- --> +<!-- <li> Language packs</li> --> +<!-- --> +<!-- <li> Paraphrase Packs</li> --> +<!-- --> +<!-- <li> Joshua releases</li> --> +<!-- --> +<!-- <li> Support</li> --> +<!-- --> +<!-- <li> Getting Started</li> --> +<!-- --> +<!-- <li> Welcome to Joshua</li> --> +<!-- --> +<!-- <li> Joshua documentation</li> --> +<!-- --> +<!-- <li> Joshua documentation</li> --> +<!-- --> +<!-- <li> Installation</li> --> +<!-- --> +<!-- <li> 
Installation</li> --> +<!-- --> +<!-- <li> Alignment with Jacana</li> --> +<!-- --> +<!-- <li> Alignment with Jacana</li> --> +<!-- --> +<!-- <li> Alignment with Jacana</li> --> +<!-- --> +<!-- <li> Building large LMs with SRILM</li> --> +<!-- --> +<!-- <li> Building large LMs with SRILM</li> --> +<!-- --> +<!-- <li> Building large LMs with SRILM</li> --> +<!-- --> +<!-- <li> Building large LMs with SRILM</li> --> +<!-- --> +<!-- <li> Lattice decoding</li> --> +<!-- --> +<!-- <li> Grammar Packing</li> --> +<!-- --> +<!-- <li> Grammar Packing</li> --> +<!-- --> +<!-- <li> Grammar Packing</li> --> +<!-- --> +<!-- <li> Grammar Packing</li> --> +<!-- --> +<!-- <li> The Joshua Pipeline</li> --> +<!-- --> +<!-- <li> The Joshua Pipeline</li> --> +<!-- --> +<!-- <li> The Joshua Pipeline</li> --> +<!-- --> +<!-- <li> The Joshua Pipeline</li> --> +<!-- --> +<!-- <li> Quick Start</li> --> +<!-- --> +<!-- <li> Quick Start</li> --> +<!-- --> +<!-- <li> Releases</li> --> +<!-- --> +<!-- <li> Server mode</li> --> +<!-- --> +<!-- <li> Server mode</li> --> +<!-- --> +<!-- <li> Server mode</li> --> +<!-- --> +<!-- <li> Installing and running the Joshua Decoder</li> --> +<!-- --> +<!-- <li> Grammar extraction with Thrax</li> --> +<!-- --> +<!-- <li> Grammar extraction with Thrax</li> --> +<!-- --> +<!-- <li> Grammar extraction with Thrax</li> --> +<!-- --> +<!-- <li> Grammar extraction with Thrax</li> --> +<!-- --> +<!-- <li> Building Translation Models</li> --> +<!-- --> +<!-- <li> Building Translation Models</li> --> +<!-- --> +<!-- <li> Building Translation Models</li> --> +<!-- --> +<!-- <li> Building Translation Models</li> --> +<!-- --> +<!-- <li> Pipeline tutorial</li> --> +<!-- --> +<!-- <li> Pipeline tutorial</li> --> +<!-- --> +<!-- <li> Pipeline tutorial</li> --> +<!-- --> +<!-- <li> What's New</li> --> +<!-- --> +<!-- <li> What's New</li> --> +<!-- --> +<!-- <li> Z-MERT</li> --> +<!-- --> +<!-- <li> Z-MERT</li> --> +<!-- --> +<!-- <li> Z-MERT</li> --> +<!-- --> +<!-- <li> 
Z-MERT</li> --> +<!-- --> +<!-- <li> </li> --> +<!-- --> +<!-- <li> </li> --> +<!-- --> +<!-- <li> </li> --> +<!-- --> +<!-- </ul> --> +<!-- </div> --> + + <div class="infobox"> + + <b>Links</b><br /> + <ul> + <li> <a href="../index.html">Main</a> </li> + <li> <a href="pipeline.html">Pipeline</a> </li> + <li> <a href="step-by-step-instructions.html">Manual walkthrough</a> </li> + <li> <a href="decoder.html">Decoder</a> </li> + <li> <a href="server.html">Decoder Server</a> </li> + <li> <a href="file-formats.html">File formats</a> </li> + <li> <a href="thrax.html">Grammar Extraction</a> </li> + <li> <a href="../releases.html">Releases</a> </li> + </ul> + </div> + + <div class="infobox"> + <b>Advanced</b><br /> + <ul> +<!-- <li> <a href="packing.html">Grammar packing</a> </li> --> + <li> <a href="large-lms.html">Building large LMs</a> </li> + <li> <a href="zmert.html">Running Z-MERT</a> </li> + <li> <a href="lattice.html">Lattices</a> </li> + <li> <a href="server.html">TCP/IP server</a> </li> + <li> <a href="bundle.html">Bundled configuration</a> </li> + </ul> + </div> + + <div class="infobox"> + <b>Help</b><br /> + <ul> + <li> <a href="faq.html">Answers</a> </li> + <li> <a href="https://groups.google.com/d/forum/joshua_support">Archive</a> </li> + </ul> + </div> + + <div class="footer"> + Last updated on April 08, 2016 + </div> + + </div> + + <div id="main"> + <div id="title"> + <h1>Building Translation Models</h1> + </div> + + <div id="content"> + + <h1 id="build-a-translation-model">Build a translation model</h1> + +<p>Extracting a grammar from a large amount of data is a multi-step process. The first requirement is parallel data. The Europarl, Call Home, and Fisher corpora all contain parallel translations of Spanish and English sentences.</p> + +<p>We will copy (or symlink) the parallel source text files in a subdirectory called <code class="highlighter-rouge">input/</code>.</p> + +<p>Then, we concatenate all the training files on each side. 
The pipeline script normally does tokenization and normalization, but in this instance we have a custom tokenizer we need to apply to the source side, so we have to do it manually and then skip that step using the <code class="highlighter-rouge">pipeline.pl</code> option <code class="highlighter-rouge">--first-step align</code>.</p> + +<ul> + <li> + <p>to tokenize the English data, do</p> + + <div class="highlighter-rouge"><pre class="highlight"><code>cat callhome.en europarl.en fisher.en > all.en | $JOSHUA/scripts/training/normalize-punctuation.pl en | $JOSHUA/scripts/training/penn-treebank-tokenizer.perl | $JOSHUA/scripts/lowercase.perl > all.norm.tok.lc.en +</code></pre> + </div> + </li> +</ul> + +<p>The same can be done for the Spanish side of the input data:</p> + +<div class="highlighter-rouge"><pre class="highlight"><code>cat callhome.es europarl.es fisher.es > all.es | $JOSHUA/scripts/training/normalize-punctuation.pl es | $JOSHUA/scripts/training/penn-treebank-tokenizer.perl | $JOSHUA/scripts/lowercase.perl > all.norm.tok.lc.es +</code></pre> +</div> + +<p>By the way, an alternative tokenizer is a Twitter tokenizer found in the <a href="http://github.com/vandurme/jerboa">Jerboa</a> project.</p> + +<p>The final step in the training data preparation is to remove all examples in which either of the language sides is a blank line.</p> + +<div class="highlighter-rouge"><pre class="highlight"><code>paste all.norm.tok.lc.es all.norm.tok.lc.en | grep -Pv "^\t|\t$" \ + | ./splittabs.pl all.norm.tok.lc.noblanks.es all.norm.tok.lc.noblanks.en +</code></pre> +</div> + +<p>contents of <code class="highlighter-rouge">splittabs.pl</code> by Matt Post:</p> + +<div class="highlighter-rouge"><pre class="highlight"><code><span class="c1">#!/usr/bin/perl</span> + +<span class="c1"># splits on tab, printing respective chunks to the list of files given</span> +<span class="c1"># as script arguments</span> + +<span class="k">use</span> <span
class="nv">FileHandle</span><span class="p">;</span> + +<span class="k">my</span> <span class="nv">@fh</span><span class="p">;</span> +<span class="vg">$|</span> <span class="o">=</span> <span class="mi">1</span><span class="p">;</span> <span class="c1"># don't buffer output</span> + +<span class="k">if</span> <span class="p">(</span><span class="nv">@ARGV</span> <span class="o"><</span> <span class="mi">0</span><span class="p">)</span> <span class="p">{</span> + <span class="k">print</span> <span class="s">"Usage: splittabs.pl < tabbed-file\n"</span><span class="p">;</span> + <span class="nb">exit</span><span class="p">;</span> +<span class="p">}</span> + +<span class="k">my</span> <span class="nv">@fh</span> <span class="o">=</span> <span class="nb">map</span> <span class="p">{</span> <span class="nv">get_filehandle</span><span class="p">(</span><span class="nv">$_</span><span class="p">)</span> <span class="p">}</span> <span class="nv">@ARGV</span><span class="p">;</span> +<span class="nv">@ARGV</span> <span class="o">=</span> <span class="p">();</span> + +<span class="k">while</span> <span class="p">(</span><span class="k">my</span> <span class="nv">$line</span> <span class="o">=</span> <span class="o"><></span><span class="p">)</span> <span class="p">{</span> + <span class="nb">chomp</span><span class="p">(</span><span class="nv">$line</span><span class="p">);</span> + <span class="k">my</span> <span class="p">(</span><span class="nv">@fields</span><span class="p">)</span> <span class="o">=</span> <span class="nb">split</span><span class="p">(</span><span class="sr">/\t/</span><span class="p">,</span><span class="nv">$line</span><span class="p">,</span><span class="nb">scalar</span> <span class="nv">@fh</span><span class="p">);</span> + + <span class="nb">map</span> <span class="p">{</span> <span class="k">print</span> <span class="p">{</span><span class="nv">$fh</span><span class="p">[</span><span class="nv">$_</span><span class="p">]}</span> <span 
class="s">"$fields[$_]\n"</span> <span class="p">}</span> <span class="p">(</span><span class="mi">0</span><span class="o">..</span><span class="nv">$#fields</span><span class="p">);</span> +<span class="p">}</span> + +<span class="k">sub </span><span class="nf">get_filehandle</span> <span class="p">{</span> + <span class="k">my</span> <span class="nv">$file</span> <span class="o">=</span> <span class="nb">shift</span><span class="p">;</span> + + <span class="k">if</span> <span class="p">(</span><span class="nv">$file</span> <span class="ow">eq</span> <span class="s">"-"</span><span class="p">)</span> <span class="p">{</span> + <span class="k">return</span> <span class="o">*</span><span class="bp">STDOUT</span><span class="p">;</span> + <span class="p">}</span> <span class="k">else</span> <span class="p">{</span> + <span class="nb">local</span> <span class="o">*</span><span class="nv">FH</span><span class="p">;</span> + <span class="nb">open</span> <span class="nv">FH</span><span class="p">,</span> <span class="s">">$file"</span> <span class="ow">or</span> <span class="nb">die</span> <span class="s">"can't open '$file' for writing"</span><span class="p">;</span> + <span class="k">return</span> <span class="o">*</span><span class="nv">FH</span><span class="p">;</span> + <span class="p">}</span> +<span class="p">}</span> +</code></pre> +</div> + +<p>Now we can run the pipeline to extract the grammar. Run the following script:</p> + +<div class="highlighter-rouge"><pre class="highlight"><code><span class="c">#!/bin/bash</span> + +<span class="c"># this creates a grammar</span> + +<span class="c"># NEED:</span> +<span class="c"># pair</span> +<span class="c"># type</span> + +<span class="nb">set</span> -u + +<span class="nv">pair</span><span class="o">=</span>es-en +<span class="nb">type</span><span class="o">=</span>hiero + +<span class="c">#. 
~/.bashrc</span> + +<span class="c">#basedir=$(pwd)</span> + +<span class="nv">dir</span><span class="o">=</span>grammar-<span class="nv">$pair</span>-<span class="nv">$type</span> + +<span class="o">[[</span> ! -d <span class="nv">$dir</span> <span class="o">]]</span> <span class="o">&&</span> mkdir -p <span class="nv">$dir</span> +<span class="nb">cd</span> <span class="nv">$dir</span> + +<span class="nb">source</span><span class="o">=</span><span class="k">$(</span><span class="nb">echo</span> <span class="nv">$pair</span> | cut -d- -f 1<span class="k">)</span> +<span class="nv">target</span><span class="o">=</span><span class="k">$(</span><span class="nb">echo</span> <span class="nv">$pair</span> | cut -d- -f 2<span class="k">)</span> + +<span class="nv">$JOSHUA</span>/scripts/training/pipeline.pl <span class="se">\</span> + --source <span class="nv">$source</span> <span class="se">\</span> + --target <span class="nv">$target</span> <span class="se">\</span> + --corpus /home/hltcoe/lorland/expts/scale12/model1/input/all.norm.tok.lc.noblanks <span class="se">\</span> + --type <span class="nv">$type</span> <span class="se">\</span> + --joshua-mem 100g <span class="se">\</span> + --no-prepare <span class="se">\</span> + --first-step align <span class="se">\</span> + --last-step thrax <span class="se">\</span> + --hadoop <span class="nv">$HADOOP</span> <span class="se">\</span> + --threads 8 <span class="se">\</span> +</code></pre> +</div> + + + </div> + </div> + + </body> +</html> + + + + + http://git-wip-us.apache.org/repos/asf/incubator-joshua-site/blob/53cc3005/4.0/tms.md ---------------------------------------------------------------------- diff --git a/4.0/tms.md b/4.0/tms.md deleted file mode 100644 index a86a311..0000000 --- a/4.0/tms.md +++ /dev/null @@ -1,106 +0,0 @@ ---- -layout: default4 -category: advanced -title: Building Translation Models ---- - -# Build a translation model - -Extracting a grammar from a large amount of data is a multi-step process. 
The first requirement is parallel data. The Europarl, Call Home, and Fisher corpora all contain parallel translations of Spanish and English sentences. - -We will copy (or symlink) the parallel source text files in a subdirectory called `input/`. - -Then, we concatenate all the training files on each side. The pipeline script normally does tokenization and normalization, but in this instance we have a custom tokenizer we need to apply to the source side, so we have to do it manually and then skip that step using the `pipeline.pl` option `--first-step alignment`. - -* to tokenize the English data, do - - cat callhome.en europarl.en fisher.en > all.en | $JOSHUA/scripts/training/normalize-punctuation.pl en | $JOSHUA/scripts/training/penn-treebank-tokenizer.perl | $JOSHUA/scripts/lowercase.perl > all.norm.tok.lc.en - -The same can be done for the Spanish side of the input data: - - cat callhome.es europarl.es fisher.es > all.es | $JOSHUA/scripts/training/normalize-punctuation.pl es | $JOSHUA/scripts/training/penn-treebank-tokenizer.perl | $JOSHUA/scripts/lowercase.perl > all.norm.tok.lc.es - -By the way, an alternative tokenizer is a Twitter tokenizer found in the [Jerboa](http://github.com/vandurme/jerboa) project. - -The final step in the training data preparation is to remove all examples in which either of the language sides is a blank line. 
- - paste all.norm.tok.lc.es all.norm.tok.lc.en | grep -Pv "^\t|\t$" \ - | ./splittabs.pl all.norm.tok.lc.noblanks.es all.norm.tok.lc.noblanks.en - -contents of `splittabls.pl` by Matt Post: - - #!/usr/bin/perl - - # splits on tab, printing respective chunks to the list of files given - # as script arguments - - use FileHandle; - - my @fh; - $| = 1; # don't buffer output - - if (@ARGV < 0) { - print "Usage: splittabs.pl < tabbed-file\n"; - exit; - } - - my @fh = map { get_filehandle($_) } @ARGV; - @ARGV = (); - - while (my $line = <>) { - chomp($line); - my (@fields) = split(/\t/,$line,scalar @fh); - - map { print {$fh[$_]} "$fields[$_]\n" } (0..$#fields); - } - - sub get_filehandle { - my $file = shift; - - if ($file eq "-") { - return *STDOUT; - } else { - local *FH; - open FH, ">$file" or die "can't open '$file' for writing"; - return *FH; - } - } - -Now we can run the pipeline to extract the grammar. Run the following script: - - #!/bin/bash - - # this creates a grammar - - # NEED: - # pair - # type - - set -u - - pair=es-en - type=hiero - - #. ~/.bashrc - - #basedir=$(pwd) - - dir=grammar-$pair-$type - - [[ ! 
-d $dir ]] && mkdir -p $dir - cd $dir - - source=$(echo $pair | cut -d- -f 1) - target=$(echo $pair | cut -d- -f 2) - - $JOSHUA/scripts/training/pipeline.pl \ - --source $source \ - --target $target \ - --corpus /home/hltcoe/lorland/expts/scale12/model1/input/all.norm.tok.lc.noblanks \ - --type $type \ - --joshua-mem 100g \ - --no-prepare \ - --first-step align \ - --last-step thrax \ - --hadoop $HADOOP \ - --threads 8 \ http://git-wip-us.apache.org/repos/asf/incubator-joshua-site/blob/53cc3005/4.0/zmert.html ---------------------------------------------------------------------- diff --git a/4.0/zmert.html b/4.0/zmert.html new file mode 100644 index 0000000..a589161 --- /dev/null +++ b/4.0/zmert.html @@ -0,0 +1,339 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> + +<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> + <head> + <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" /> + <link rel="stylesheet" type="text/css" media="screen,print" href="../joshua4.css" /> + <title>Joshua | Z-MERT</title> + </head> + + <body> + + <div id="navbar"> + <a href="http://joshua-decoder.org/"> + <img src="../images/joshua-logo-small.png" width="130px" + alt="Joshua logo (picture of a Joshua tree)" /> + </a> + + <p class="infobox"> + <b>Stable version</b><br /> + 4.1<br/><br/> + <b>Release date</b><br /> + 2013 January + </p> + +<!-- <div class="infobox"> --> +<!-- <b>AUTO LINKS</b><br/> --> +<!-- <ul> --> +<!-- --> +<!-- <li> Advanced features</li> --> +<!-- --> +<!-- <li> Advanced features</li> --> +<!-- --> +<!-- <li> Advanced features</li> --> +<!-- --> +<!-- <li> Building a language pack</li> --> +<!-- --> +<!-- <li> Building a language pack</li> --> +<!-- --> +<!-- <li> Bundling a configuration</li> --> +<!-- --> +<!-- <li> Contributors</li> --> +<!-- --> +<!-- <li> Decoder configuration parameters</li> --> +<!-- --> +<!-- <li> Decoder 
configuration parameters</li> --> +<!-- --> +<!-- <li> Decoder configuration parameters</li> --> +<!-- --> +<!-- <li> Decoder configuration parameters</li> --> +<!-- --> +<!-- <li> Frequently Asked Questions</li> --> +<!-- --> +<!-- <li> Common problems</li> --> +<!-- --> +<!-- <li> Frequently Asked Questions</li> --> +<!-- --> +<!-- <li> Common problems</li> --> +<!-- --> +<!-- <li> Features</li> --> +<!-- --> +<!-- <li> Features</li> --> +<!-- --> +<!-- <li> Features</li> --> +<!-- --> +<!-- <li> Features</li> --> +<!-- --> +<!-- <li> Joshua file formats</li> --> +<!-- --> +<!-- <li> Joshua file formats</li> --> +<!-- --> +<!-- <li> Joshua file formats</li> --> +<!-- --> +<!-- <li> Joshua file formats</li> --> +<!-- --> +<!-- <li> </li> --> +<!-- --> +<!-- <li> </li> --> +<!-- --> +<!-- <li> </li> --> +<!-- --> +<!-- <li> Fisher and CALLHOME Spanish English Speech Translation Corpus</li> --> +<!-- --> +<!-- <li> Indian Languages Parallel Corpora</li> --> +<!-- --> +<!-- <li> Joshua 4.0 User Documentation</li> --> +<!-- --> +<!-- <li> Language packs</li> --> +<!-- --> +<!-- <li> Paraphrase Packs</li> --> +<!-- --> +<!-- <li> Joshua releases</li> --> +<!-- --> +<!-- <li> Support</li> --> +<!-- --> +<!-- <li> Getting Started</li> --> +<!-- --> +<!-- <li> Welcome to Joshua</li> --> +<!-- --> +<!-- <li> Joshua documentation</li> --> +<!-- --> +<!-- <li> Joshua documentation</li> --> +<!-- --> +<!-- <li> Installation</li> --> +<!-- --> +<!-- <li> Installation</li> --> +<!-- --> +<!-- <li> Alignment with Jacana</li> --> +<!-- --> +<!-- <li> Alignment with Jacana</li> --> +<!-- --> +<!-- <li> Alignment with Jacana</li> --> +<!-- --> +<!-- <li> Building large LMs with SRILM</li> --> +<!-- --> +<!-- <li> Building large LMs with SRILM</li> --> +<!-- --> +<!-- <li> Building large LMs with SRILM</li> --> +<!-- --> +<!-- <li> Building large LMs with SRILM</li> --> +<!-- --> +<!-- <li> Lattice decoding</li> --> +<!-- --> +<!-- <li> Grammar Packing</li> --> +<!-- --> +<!-- <li> 
Grammar Packing</li> --> +<!-- --> +<!-- <li> Grammar Packing</li> --> +<!-- --> +<!-- <li> Grammar Packing</li> --> +<!-- --> +<!-- <li> The Joshua Pipeline</li> --> +<!-- --> +<!-- <li> The Joshua Pipeline</li> --> +<!-- --> +<!-- <li> The Joshua Pipeline</li> --> +<!-- --> +<!-- <li> The Joshua Pipeline</li> --> +<!-- --> +<!-- <li> Quick Start</li> --> +<!-- --> +<!-- <li> Quick Start</li> --> +<!-- --> +<!-- <li> Releases</li> --> +<!-- --> +<!-- <li> Server mode</li> --> +<!-- --> +<!-- <li> Server mode</li> --> +<!-- --> +<!-- <li> Server mode</li> --> +<!-- --> +<!-- <li> Installing and running the Joshua Decoder</li> --> +<!-- --> +<!-- <li> Grammar extraction with Thrax</li> --> +<!-- --> +<!-- <li> Grammar extraction with Thrax</li> --> +<!-- --> +<!-- <li> Grammar extraction with Thrax</li> --> +<!-- --> +<!-- <li> Grammar extraction with Thrax</li> --> +<!-- --> +<!-- <li> Building Translation Models</li> --> +<!-- --> +<!-- <li> Building Translation Models</li> --> +<!-- --> +<!-- <li> Building Translation Models</li> --> +<!-- --> +<!-- <li> Building Translation Models</li> --> +<!-- --> +<!-- <li> Pipeline tutorial</li> --> +<!-- --> +<!-- <li> Pipeline tutorial</li> --> +<!-- --> +<!-- <li> Pipeline tutorial</li> --> +<!-- --> +<!-- <li> What's New</li> --> +<!-- --> +<!-- <li> What's New</li> --> +<!-- --> +<!-- <li> Z-MERT</li> --> +<!-- --> +<!-- <li> Z-MERT</li> --> +<!-- --> +<!-- <li> Z-MERT</li> --> +<!-- --> +<!-- <li> Z-MERT</li> --> +<!-- --> +<!-- <li> </li> --> +<!-- --> +<!-- <li> </li> --> +<!-- --> +<!-- <li> </li> --> +<!-- --> +<!-- </ul> --> +<!-- </div> --> + + <div class="infobox"> + + <b>Links</b><br /> + <ul> + <li> <a href="../index.html">Main</a> </li> + <li> <a href="pipeline.html">Pipeline</a> </li> + <li> <a href="step-by-step-instructions.html">Manual walkthrough</a> </li> + <li> <a href="decoder.html">Decoder</a> </li> + <li> <a href="server.html">Decoder Server</a> </li> + <li> <a href="file-formats.html">File 
formats</a> </li> + <li> <a href="thrax.html">Grammar Extraction</a> </li> + <li> <a href="../releases.html">Releases</a> </li> + </ul> + </div> + + <div class="infobox"> + <b>Advanced</b><br /> + <ul> +<!-- <li> <a href="packing.html">Grammar packing</a> </li> --> + <li> <a href="large-lms.html">Building large LMs</a> </li> + <li> <a href="zmert.html">Running Z-MERT</a> </li> + <li> <a href="lattice.html">Lattices</a> </li> + <li> <a href="server.html">TCP/IP server</a> </li> + <li> <a href="bundle.html">Bundled configuration</a> </li> + </ul> + </div> + + <div class="infobox"> + <b>Help</b><br /> + <ul> + <li> <a href="faq.html">Answers</a> </li> + <li> <a href="https://groups.google.com/d/forum/joshua_support">Archive</a> </li> + </ul> + </div> + + <div class="footer"> + Last updated on April 08, 2016 + </div> + + </div> + + <div id="main"> + <div id="title"> + <h1>Z-MERT</h1> + </div> + + <div id="content"> + + <p>This document describes how to manually run the ZMERT module. ZMERT is Joshua’s minimum error-rate +training module, written by Omar F. Zaidan. It is easily adapted to drop in different decoders, and +was also written so as to work with different objective functions (other than BLEU).</p> + +<p>((Section (1) in <code class="highlighter-rouge">$JOSHUA/examples/ZMERT/README_ZMERT.txt</code> is an expanded version of this section))</p> + +<p>Z-MERT can be used by launching the driver program (<code class="highlighter-rouge">ZMERT.java</code>), which expects a config file as +its main argument. This config file can be used to specify any subset of Z-MERT’s 20-some +parameters.
For a full list of those parameters, and their default values, run ZMERT with a single +-h argument as follows:</p> + +<div class="highlighter-rouge"><pre class="highlight"><code>java -cp $JOSHUA/bin joshua.zmert.ZMERT -h +</code></pre> +</div> + +<p>So what does a Z-MERT config file look like?</p> + +<p>Examine the file <code class="highlighter-rouge">examples/ZMERT/ZMERT_config_ex2.txt</code>. You will find that it +specifies the following "main" MERT parameters:</p> + +<div class="highlighter-rouge"><pre class="highlight"><code>(*) -dir dirPrefix: working directory +(*) -s sourceFile: source sentences (foreign sentences) of the MERT dataset +(*) -r refFile: target sentences (reference translations) of the MERT dataset +(*) -rps refsPerSen: number of reference translations per sentence +(*) -p paramsFile: file containing parameter names, initial values, and ranges +(*) -maxIt maxMERTIts: maximum number of MERT iterations +(*) -ipi initsPerIt: number of intermediate initial points per iteration +(*) -cmd commandFile: name of file containing commands to run the decoder +(*) -decOut decoderOutFile: name of the output file produced by the decoder +(*) -dcfg decConfigFile: name of decoder config file +(*) -N N: size of N-best list (per sentence) generated in each MERT iteration +(*) -v verbosity: output verbosity level (0-2; higher value => more verbose) +(*) -seed seed: seed used to initialize the random number generator +</code></pre> +</div> + +<p>(Note that the <code class="highlighter-rouge">-s</code> parameter is only used if Z-MERT is running Joshua as an + internal decoder. 
If Joshua is run as an external decoder, as is the case in + this README, then this parameter is ignored.)</p> + +<p>To test Z-MERT on the 100-sentence test set of example2, provide this config +file to Z-MERT as follows:</p> + +<div class="highlighter-rouge"><pre class="highlight"><code>java -cp bin joshua.zmert.ZMERT -maxMem 500 examples/ZMERT/ZMERT_config_ex2.txt > examples/ZMERT/ZMERT_example/ZMERT.out +</code></pre> +</div> + +<p>This will run Z-MERT for a couple of iterations on the data from the example2 +folder. (Notice that we have made copies of the source and reference files +from example2 and renamed them as src.txt and ref.* in the ZMERT_example folder, +just to have all the files needed by Z-MERT in one place.) Once the Z-MERT run +is complete, you should be able to inspect the log file to see what kinds of +things it did. If everything goes well, the run should take a few minutes, of +which more than 95% is time spent by Z-MERT waiting on Joshua to finish +decoding the sentences (once per iteration).</p> + +<p>The output file you get should be equivalent to <code class="highlighter-rouge">ZMERT.out.verbosity1</code>. If you +rerun the experiment with the verbosity (-v) argument set to 2 instead of 1, +the output file you get should be equivalent to <code class="highlighter-rouge">ZMERT.out.verbosity2</code>, which has +more interesting details about what Z-MERT does.</p> + +<p>Notice the additional <code class="highlighter-rouge">-maxMem</code> argument. It tells Z-MERT that it should not +continue to hold on to memory while the decoder is running (during which time Z-MERT +would be idle). The 500 tells Z-MERT that it can only use a maximum of 500 MB. +For more details on this issue, see section (4) in Z-MERT's README.</p> + +<p>A quick note about Z-MERT's interaction with the decoder. 
If you examine the +file <code class="highlighter-rouge">decoder_command_ex2.txt</code>, which is provided as the commandFile (<code class="highlighter-rouge">-cmd</code>) +argument in Z-MERT's config file, you'll find it contains the command one would +use to run the decoder. Z-MERT launches the commandFile as an external +process, and assumes that it will launch the decoder to produce translations. +(Make sure that commandFile is executable.) After launching this external +process, Z-MERT waits for it to finish, then uses the resulting output file for +parameter tuning (in addition to the output files from previous iterations). +The command file here only has a single command, but your command file could +have multiple lines. Just make sure the command file itself is executable.</p> + +<p>Notice that the Z-MERT arguments <code class="highlighter-rouge">decConfigFile</code> and <code class="highlighter-rouge">decoderOutFile</code> (<code class="highlighter-rouge">-dcfg</code> and +<code class="highlighter-rouge">-decOut</code>) must match the two Joshua arguments in the commandFile's (<code class="highlighter-rouge">-cmd</code>) single +command. Also, the Z-MERT argument for N must match the value for <code class="highlighter-rouge">top_n</code> in +Joshua's config file, indicated by the Z-MERT argument decConfigFile (<code class="highlighter-rouge">-dcfg</code>).</p> + +<p>For more details on Z-MERT, refer to <code class="highlighter-rouge">$JOSHUA/examples/ZMERT/README_ZMERT.txt</code>.</p> + + + </div> + </div> + + </body> +</html> + + + + + http://git-wip-us.apache.org/repos/asf/incubator-joshua-site/blob/53cc3005/4.0/zmert.md ---------------------------------------------------------------------- diff --git a/4.0/zmert.md b/4.0/zmert.md deleted file mode 100644 index 538a2ac..0000000 --- a/4.0/zmert.md +++ /dev/null @@ -1,83 +0,0 @@ ---- -layout: default4 -category: advanced -title: Z-MERT ---- - -This document describes how to manually run the ZMERT module. 
ZMERT is Joshua's minimum error-rate -training module, written by Omar F. Zaidan. It is easily adapted to drop in different decoders, and -was also written so as to work with different objective functions (other than BLEU). - -((Section (1) in `$JOSHUA/examples/ZMERT/README_ZMERT.txt` is an expanded version of this section)) - -Z-MERT, can be used by launching the driver program (`ZMERT.java`), which expects a config file as -its main argument. This config file can be used to specify any subset of Z-MERT's 20-some -parameters. For a full list of those parameters, and their default values, run ZMERT with a single --h argument as follows: - - java -cp $JOSHUA/bin joshua.zmert.ZMERT -h - -So what does a Z-MERT config file look like? - -Examine the file `examples/ZMERT/ZMERT_config_ex2.txt`. You will find that it -specifies the following "main" MERT parameters: - - (*) -dir dirPrefix: working directory - (*) -s sourceFile: source sentences (foreign sentences) of the MERT dataset - (*) -r refFile: target sentences (reference translations) of the MERT dataset - (*) -rps refsPerSen: number of reference translations per sentence - (*) -p paramsFile: file containing parameter names, initial values, and ranges - (*) -maxIt maxMERTIts: maximum number of MERT iterations - (*) -ipi initsPerIt: number of intermediate initial points per iteration - (*) -cmd commandFile: name of file containing commands to run the decoder - (*) -decOut decoderOutFile: name of the output file produced by the decoder - (*) -dcfg decConfigFile: name of decoder config file - (*) -N N: size of N-best list (per sentence) generated in each MERT iteration - (*) -v verbosity: output verbosity level (0-2; higher value => more verbose) - (*) -seed seed: seed used to initialize the random number generator - -(Note that the `-s` parameter is only used if Z-MERT is running Joshua as an - internal decoder. If Joshua is run as an external decoder, as is the case in - this README, then this parameter is ignored.) 
- -To test Z-MERT on the 100-sentence test set of example2, provide this config -file to Z-MERT as follows: - - java -cp bin joshua.zmert.ZMERT -maxMem 500 examples/ZMERT/ZMERT_config_ex2.txt > examples/ZMERT/ZMERT_example/ZMERT.out - -This will run Z-MERT for a couple of iterations on the data from the example2 -folder. (Notice that we have made copies of the source and reference files -from example2 and renamed them as src.txt and ref.* in the MERT_example folder, -just to have all the files needed by Z-MERT in one place.) Once the Z-MERT run -is complete, you should be able to inspect the log file to see what kinds of -things it did. If everything goes well, the run should take a few minutes, of -which more than 95% is time spent by Z-MERT waiting on Joshua to finish -decoding the sentences (once per iteration). - -The output file you get should be equivalent to `ZMERT.out.verbosity1`. If you -rerun the experiment with the verbosity (-v) argument set to 2 instead of 1, -the output file you get should be equivalent to `ZMERT.out.verbosity2`, which has -more interesting details about what Z-MERT does. - -Notice the additional `-maxMem` argument. It tells Z-MERT that it should not -persist to use up memory while the decoder is running (during which time Z-MERT -would be idle). The 500 tells Z-MERT that it can only use a maximum of 500 MB. -For more details on this issue, see section (4) in Z-MERT's README. - -A quick note about Z-MERT's interaction with the decoder. If you examine the -file `decoder_command_ex2.txt`, which is provided as the commandFile (`-cmd`) -argument in Z-MERT's config file, you'll find it contains the command one would -use to run the decoder. Z-MERT launches the commandFile as an external -process, and assumes that it will launch the decoder to produce translations. -(Make sure that commandFile is executable.) 
After launching this external -process, Z-MERT waits for it to finish, then uses the resulting output file for -parameter tuning (in addition to the output files from previous iterations). -The command file here only has a single command, but your command file could -have multiple lines. Just make sure the command file itself is executable. - -Notice that the Z-MERT arguments `configFile` and `decoderOutFile` (`-cfg` and -`-decOut`) must match the two Joshua arguments in the commandFile's (`-cmd`) single -command. Also, the Z-MERT argument for N must match the value for `top_n` in -Joshua's config file, indicated by the Z-MERT argument configFile (`-cfg`). - -For more details on Z-MERT, refer to `$JOSHUA/examples/ZMERT/README_ZMERT.txt` http://git-wip-us.apache.org/repos/asf/incubator-joshua-site/blob/53cc3005/5.0/advanced.html ---------------------------------------------------------------------- diff --git a/5.0/advanced.html b/5.0/advanced.html new file mode 100644 index 0000000..ad963e7 --- /dev/null +++ b/5.0/advanced.html @@ -0,0 +1,170 @@ +<!DOCTYPE html> +<html lang="en"> + <head> + <meta charset="utf-8"> + <title>Joshua Documentation | Advanced features</title> + <meta name="viewport" content="width=device-width, initial-scale=1.0"> + <meta name="description" content=""> + <meta name="author" content=""> + + <!-- Le styles --> + <link href="/bootstrap/css/bootstrap.css" rel="stylesheet"> + <style> + body { + padding-top: 60px; /* 60px to make the container go all the way to the bottom of the topbar */ + } + #download { + background-color: green; + font-size: 14pt; + font-weight: bold; + text-align: center; + color: white; + border-radius: 5px; + padding: 4px; + } + + #download a:link { + color: white; + } + + #download a:hover { + color: lightgrey; + } + + #download a:visited { + color: white; + } + + a.pdf { + font-variant: small-caps; + /* font-weight: bold; */ + font-size: 10pt; + color: white; + background: brown; + padding: 2px; + } + + a.bibtex { + 
font-variant: small-caps; + /* font-weight: bold; */ + font-size: 10pt; + color: white; + background: orange; + padding: 2px; + } + + img.sponsor { + height: 120px; + margin: 5px; + } + </style> + <link href="bootstrap/css/bootstrap-responsive.css" rel="stylesheet"> + + <!-- HTML5 shim, for IE6-8 support of HTML5 elements --> + <!--[if lt IE 9]> + <script src="bootstrap/js/html5shiv.js"></script> + <![endif]--> + + <!-- Fav and touch icons --> + <link rel="apple-touch-icon-precomposed" sizes="144x144" href="bootstrap/ico/apple-touch-icon-144-precomposed.png"> + <link rel="apple-touch-icon-precomposed" sizes="114x114" href="bootstrap/ico/apple-touch-icon-114-precomposed.png"> + <link rel="apple-touch-icon-precomposed" sizes="72x72" href="bootstrap/ico/apple-touch-icon-72-precomposed.png"> + <link rel="apple-touch-icon-precomposed" href="bootstrap/ico/apple-touch-icon-57-precomposed.png"> + <link rel="shortcut icon" href="bootstrap/ico/favicon.png"> + </head> + + <body> + + <div class="navbar navbar-inverse navbar-fixed-top"> + <div class="navbar-inner"> + <div class="container"> + <button type="button" class="btn btn-navbar" data-toggle="collapse" data-target=".nav-collapse"> + <span class="icon-bar"></span> + <span class="icon-bar"></span> + <span class="icon-bar"></span> + </button> + <a class="brand" href="/">Joshua</a> + <div class="nav-collapse collapse"> + <ul class="nav"> + <li><a href="index.html">Documentation</a></li> + <li><a href="pipeline.html">Pipeline</a></li> + <li><a href="tutorial.html">Tutorial</a></li> + <li><a href="decoder.html">Decoder</a></li> + <li><a href="thrax.html">Thrax</a></li> + <li><a href="file-formats.html">File formats</a></li> + <!-- <li><a href="advanced.html">Advanced</a></li> --> + <li><a href="faq.html">FAQ</a></li> + </ul> + </div><!--/.nav-collapse --> + </div> + </div> + </div> + + <div class="container"> + + <div class="row"> + <div class="span2"> + <img src="/images/joshua-logo-small.png" + alt="Joshua logo (picture of a 
Joshua tree)" /> + </div> + <div class="span10"> + <h1>Joshua Documentation</h1> + <h2>Advanced features</h2> + <span id="download"> + <a href="http://cs.jhu.edu/~post/files/joshua-v5.0.tgz">Download</a> + </span> + (version 5.0, released 16 August 2013) + </div> + </div> + + <hr /> + + <div class="row"> + <div class="span8"> + + + + + </div> + </div> + </div> <!-- /container --> + + <!-- Le javascript + ================================================== --> + <!-- Placed at the end of the document so the pages load faster --> + <script src="bootstrap/js/jquery.js"></script> + <script src="bootstrap/js/bootstrap-transition.js"></script> + <script src="bootstrap/js/bootstrap-alert.js"></script> + <script src="bootstrap/js/bootstrap-modal.js"></script> + <script src="bootstrap/js/bootstrap-dropdown.js"></script> + <script src="bootstrap/js/bootstrap-scrollspy.js"></script> + <script src="bootstrap/js/bootstrap-tab.js"></script> + <script src="bootstrap/js/bootstrap-tooltip.js"></script> + <script src="bootstrap/js/bootstrap-popover.js"></script> + <script src="bootstrap/js/bootstrap-button.js"></script> + <script src="bootstrap/js/bootstrap-collapse.js"></script> + <script src="bootstrap/js/bootstrap-carousel.js"></script> + <script src="bootstrap/js/bootstrap-typeahead.js"></script> + + <!-- Start of StatCounter Code for Default Guide --> + <script type="text/javascript"> + var sc_project=8264132; + var sc_invisible=1; + var sc_security="4b97fe2d"; + </script> + <script type="text/javascript" src="http://www.statcounter.com/counter/counter.js"></script> + <noscript> + <div class="statcounter"> + <a title="hit counter joomla" + href="http://statcounter.com/joomla/" + target="_blank"> + <img class="statcounter" + src="http://c.statcounter.com/8264132/0/4b97fe2d/1/" + alt="hit counter joomla" /> + </a> + </div> + </noscript> + <!-- End of StatCounter Code for Default Guide --> + + </body> +</html> 
http://git-wip-us.apache.org/repos/asf/incubator-joshua-site/blob/53cc3005/5.0/advanced.md ---------------------------------------------------------------------- diff --git a/5.0/advanced.md b/5.0/advanced.md deleted file mode 100644 index 174041e..0000000 --- a/5.0/advanced.md +++ /dev/null @@ -1,7 +0,0 @@ ---- -layout: default -category: links -title: Advanced features ---- - - http://git-wip-us.apache.org/repos/asf/incubator-joshua-site/blob/53cc3005/5.0/bundle.html ---------------------------------------------------------------------- diff --git a/5.0/bundle.html b/5.0/bundle.html new file mode 100644 index 0000000..4977a8f --- /dev/null +++ b/5.0/bundle.html @@ -0,0 +1,189 @@ +<!DOCTYPE html> +<html lang="en"> + <head> + <meta charset="utf-8"> + <title>Joshua Documentation | Bundling a configuration</title> + <meta name="viewport" content="width=device-width, initial-scale=1.0"> + <meta name="description" content=""> + <meta name="author" content=""> + + <!-- Le styles --> + <link href="/bootstrap/css/bootstrap.css" rel="stylesheet"> + <style> + body { + padding-top: 60px; /* 60px to make the container go all the way to the bottom of the topbar */ + } + #download { + background-color: green; + font-size: 14pt; + font-weight: bold; + text-align: center; + color: white; + border-radius: 5px; + padding: 4px; + } + + #download a:link { + color: white; + } + + #download a:hover { + color: lightgrey; + } + + #download a:visited { + color: white; + } + + a.pdf { + font-variant: small-caps; + /* font-weight: bold; */ + font-size: 10pt; + color: white; + background: brown; + padding: 2px; + } + + a.bibtex { + font-variant: small-caps; + /* font-weight: bold; */ + font-size: 10pt; + color: white; + background: orange; + padding: 2px; + } + + img.sponsor { + height: 120px; + margin: 5px; + } + </style> + <link href="bootstrap/css/bootstrap-responsive.css" rel="stylesheet"> + + <!-- HTML5 shim, for IE6-8 support of HTML5 elements --> + <!--[if lt IE 9]> + <script 
src="bootstrap/js/html5shiv.js"></script> + <![endif]--> + + <!-- Fav and touch icons --> + <link rel="apple-touch-icon-precomposed" sizes="144x144" href="bootstrap/ico/apple-touch-icon-144-precomposed.png"> + <link rel="apple-touch-icon-precomposed" sizes="114x114" href="bootstrap/ico/apple-touch-icon-114-precomposed.png"> + <link rel="apple-touch-icon-precomposed" sizes="72x72" href="bootstrap/ico/apple-touch-icon-72-precomposed.png"> + <link rel="apple-touch-icon-precomposed" href="bootstrap/ico/apple-touch-icon-57-precomposed.png"> + <link rel="shortcut icon" href="bootstrap/ico/favicon.png"> + </head> + + <body> + + <div class="navbar navbar-inverse navbar-fixed-top"> + <div class="navbar-inner"> + <div class="container"> + <button type="button" class="btn btn-navbar" data-toggle="collapse" data-target=".nav-collapse"> + <span class="icon-bar"></span> + <span class="icon-bar"></span> + <span class="icon-bar"></span> + </button> + <a class="brand" href="/">Joshua</a> + <div class="nav-collapse collapse"> + <ul class="nav"> + <li><a href="index.html">Documentation</a></li> + <li><a href="pipeline.html">Pipeline</a></li> + <li><a href="tutorial.html">Tutorial</a></li> + <li><a href="decoder.html">Decoder</a></li> + <li><a href="thrax.html">Thrax</a></li> + <li><a href="file-formats.html">File formats</a></li> + <!-- <li><a href="advanced.html">Advanced</a></li> --> + <li><a href="faq.html">FAQ</a></li> + </ul> + </div><!--/.nav-collapse --> + </div> + </div> + </div> + + <div class="container"> + + <div class="row"> + <div class="span2"> + <img src="/images/joshua-logo-small.png" + alt="Joshua logo (picture of a Joshua tree)" /> + </div> + <div class="span10"> + <h1>Joshua Documentation</h1> + <h2>Bundling a configuration</h2> + <span id="download"> + <a href="http://cs.jhu.edu/~post/files/joshua-v5.0.tgz">Download</a> + </span> + (version 5.0, released 16 August 2013) + </div> + </div> + + <hr /> + + <div class="row"> + <div class="span8"> + + <p>A <em>bundled 
configuration</em> is a minimal set of configuration, resource, and script files. A script, <code class="highlighter-rouge">$JOSHUA/scripts/support/run-bundler.py</code>, can be used to package up the run bundle. The resulting bundle can easily be transferred and shared.</p> + +<p><strong>Example invocation:</strong></p> + +<div class="highlighter-rouge"><pre class="highlight"><code>./run-bundler.py \ + --force \ + /path/to/rundir/runs/5/test/1/joshua.config \ + /path/to/rundir/runs/5 \ + bundled-configurations \ + "-top-n 1 \ + -output-format %S \ + -mark-oovs false \ + -server-port 5674 \ + -tm/pt 'thrax pt 20 /path/to/rundir/runs/5/test/1/grammar.gz'" +</code></pre> +</div> + +<p>A new directory <code class="highlighter-rouge">./bundled-configurations</code> will be created, and all the bundled files will be copied or created in it. To use the configuration with Joshua, run the executable file <code class="highlighter-rouge">./bundled-configurations/bundle-runner.sh</code>.</p> + +<p>Note, the additional options between the pair of quotation marks are passed as arguments to the <code class="highlighter-rouge">$JOSHUA/scripts/copy-config.pl</code> script. 
That script has some special parameters, especially the <code class="highlighter-rouge">-tm/..</code> option.</p> + + + </div> + </div> + </div> <!-- /container --> + + <!-- Le javascript + ================================================== --> + <!-- Placed at the end of the document so the pages load faster --> + <script src="bootstrap/js/jquery.js"></script> + <script src="bootstrap/js/bootstrap-transition.js"></script> + <script src="bootstrap/js/bootstrap-alert.js"></script> + <script src="bootstrap/js/bootstrap-modal.js"></script> + <script src="bootstrap/js/bootstrap-dropdown.js"></script> + <script src="bootstrap/js/bootstrap-scrollspy.js"></script> + <script src="bootstrap/js/bootstrap-tab.js"></script> + <script src="bootstrap/js/bootstrap-tooltip.js"></script> + <script src="bootstrap/js/bootstrap-popover.js"></script> + <script src="bootstrap/js/bootstrap-button.js"></script> + <script src="bootstrap/js/bootstrap-collapse.js"></script> + <script src="bootstrap/js/bootstrap-carousel.js"></script> + <script src="bootstrap/js/bootstrap-typeahead.js"></script> + + <!-- Start of StatCounter Code for Default Guide --> + <script type="text/javascript"> + var sc_project=8264132; + var sc_invisible=1; + var sc_security="4b97fe2d"; + </script> + <script type="text/javascript" src="http://www.statcounter.com/counter/counter.js"></script> + <noscript> + <div class="statcounter"> + <a title="hit counter joomla" + href="http://statcounter.com/joomla/" + target="_blank"> + <img class="statcounter" + src="http://c.statcounter.com/8264132/0/4b97fe2d/1/" + alt="hit counter joomla" /> + </a> + </div> + </noscript> + <!-- End of StatCounter Code for Default Guide --> + + </body> +</html> http://git-wip-us.apache.org/repos/asf/incubator-joshua-site/blob/53cc3005/5.0/bundle.md ---------------------------------------------------------------------- diff --git a/5.0/bundle.md b/5.0/bundle.md deleted file mode 100644 index c3874ab..0000000 --- a/5.0/bundle.md +++ /dev/null @@ 
-1,24 +0,0 @@ ---- -layout: default -category: links -title: Bundling a configuration ---- - -A *bundled configuration* is a minimal set of configuration, resource, and script files. A script, `$JOSHUA/scripts/support/run-bundler.py` can be used to package up the run bundle. The resulting bundle can easily be transferred and shared. - -**Example invocation:** - - ./run-bundler.py \ - --force \ - /path/to/rundir/runs/5/test/1/joshua.config \ - /path/to/rundir/runs/5 \ - bundled-configurations \ - "-top-n 1 \ - -output-format %S \ - -mark-oovs false \ - -server-port 5674 \ - -tm/pt "thrax pt 20 /path/to/rundir/runs/5/test/1/grammar.gz" - -A new directory `./bundled-configurations` will be created, and all the bundled files will be copied or created in it. To use the configuration with Joshua, run the executable file `./bundled-configurations/bundle-runner.sh`. - -Note, the additional options between the pair of quotation marks are passed as arguments to the `$JOSHUA/scripts/copy-config.pl` script. That script has some special parameters, especially the `-tm/..` option.
