http://git-wip-us.apache.org/repos/asf/incubator-joshua-site/blob/53cc3005/4.0/features.html ---------------------------------------------------------------------- diff --git a/4.0/features.html b/4.0/features.html new file mode 100644 index 0000000..49dcecc --- /dev/null +++ b/4.0/features.html @@ -0,0 +1,257 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> + +<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> + <head> + <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" /> + <link rel="stylesheet" type="text/css" media="screen,print" href="../joshua4.css" /> + <title>Joshua | Features</title> + </head> + + <body> + + <div id="navbar"> + <a href="http://joshua-decoder.org/"> + <img src="../images/joshua-logo-small.png" width="130px" + alt="Joshua logo (picture of a Joshua tree)" /> + </a> + + <p class="infobox"> + <b>Stable version</b><br /> + 4.1<br/><br/> + <b>Release date</b><br /> + 2013 January + </p> + +<!-- <div class="infobox"> --> +<!-- <b>AUTO LINKS</b><br/> --> +<!-- <ul> --> +<!-- --> +<!-- <li> Advanced features</li> --> +<!-- --> +<!-- <li> Advanced features</li> --> +<!-- --> +<!-- <li> Advanced features</li> --> +<!-- --> +<!-- <li> Building a language pack</li> --> +<!-- --> +<!-- <li> Building a language pack</li> --> +<!-- --> +<!-- <li> Bundling a configuration</li> --> +<!-- --> +<!-- <li> Contributors</li> --> +<!-- --> +<!-- <li> Decoder configuration parameters</li> --> +<!-- --> +<!-- <li> Decoder configuration parameters</li> --> +<!-- --> +<!-- <li> Decoder configuration parameters</li> --> +<!-- --> +<!-- <li> Decoder configuration parameters</li> --> +<!-- --> +<!-- <li> Frequently Asked Questions</li> --> +<!-- --> +<!-- <li> Common problems</li> --> +<!-- --> +<!-- <li> Frequently Asked Questions</li> --> +<!-- --> +<!-- <li> Common problems</li> --> +<!-- --> +<!-- <li> Features</li> --> +<!-- --> 
+<!-- <li> Features</li> --> +<!-- --> +<!-- <li> Features</li> --> +<!-- --> +<!-- <li> Features</li> --> +<!-- --> +<!-- <li> Joshua file formats</li> --> +<!-- --> +<!-- <li> Joshua file formats</li> --> +<!-- --> +<!-- <li> Joshua file formats</li> --> +<!-- --> +<!-- <li> Joshua file formats</li> --> +<!-- --> +<!-- <li> </li> --> +<!-- --> +<!-- <li> </li> --> +<!-- --> +<!-- <li> </li> --> +<!-- --> +<!-- <li> Fisher and CALLHOME Spanish English Speech Translation Corpus</li> --> +<!-- --> +<!-- <li> Indian Languages Parallel Corpora</li> --> +<!-- --> +<!-- <li> Joshua 4.0 User Documentation</li> --> +<!-- --> +<!-- <li> Language packs</li> --> +<!-- --> +<!-- <li> Paraphrase Packs</li> --> +<!-- --> +<!-- <li> Joshua releases</li> --> +<!-- --> +<!-- <li> Support</li> --> +<!-- --> +<!-- <li> Getting Started</li> --> +<!-- --> +<!-- <li> Welcome to Joshua</li> --> +<!-- --> +<!-- <li> Joshua documentation</li> --> +<!-- --> +<!-- <li> Joshua documentation</li> --> +<!-- --> +<!-- <li> Installation</li> --> +<!-- --> +<!-- <li> Installation</li> --> +<!-- --> +<!-- <li> Alignment with Jacana</li> --> +<!-- --> +<!-- <li> Alignment with Jacana</li> --> +<!-- --> +<!-- <li> Alignment with Jacana</li> --> +<!-- --> +<!-- <li> Building large LMs with SRILM</li> --> +<!-- --> +<!-- <li> Building large LMs with SRILM</li> --> +<!-- --> +<!-- <li> Building large LMs with SRILM</li> --> +<!-- --> +<!-- <li> Building large LMs with SRILM</li> --> +<!-- --> +<!-- <li> Lattice decoding</li> --> +<!-- --> +<!-- <li> Grammar Packing</li> --> +<!-- --> +<!-- <li> Grammar Packing</li> --> +<!-- --> +<!-- <li> Grammar Packing</li> --> +<!-- --> +<!-- <li> Grammar Packing</li> --> +<!-- --> +<!-- <li> The Joshua Pipeline</li> --> +<!-- --> +<!-- <li> The Joshua Pipeline</li> --> +<!-- --> +<!-- <li> The Joshua Pipeline</li> --> +<!-- --> +<!-- <li> The Joshua Pipeline</li> --> +<!-- --> +<!-- <li> Quick Start</li> --> +<!-- --> +<!-- <li> Quick Start</li> --> +<!-- --> 
+<!-- <li> Releases</li> --> +<!-- --> +<!-- <li> Server mode</li> --> +<!-- --> +<!-- <li> Server mode</li> --> +<!-- --> +<!-- <li> Server mode</li> --> +<!-- --> +<!-- <li> Installing and running the Joshua Decoder</li> --> +<!-- --> +<!-- <li> Grammar extraction with Thrax</li> --> +<!-- --> +<!-- <li> Grammar extraction with Thrax</li> --> +<!-- --> +<!-- <li> Grammar extraction with Thrax</li> --> +<!-- --> +<!-- <li> Grammar extraction with Thrax</li> --> +<!-- --> +<!-- <li> Building Translation Models</li> --> +<!-- --> +<!-- <li> Building Translation Models</li> --> +<!-- --> +<!-- <li> Building Translation Models</li> --> +<!-- --> +<!-- <li> Building Translation Models</li> --> +<!-- --> +<!-- <li> Pipeline tutorial</li> --> +<!-- --> +<!-- <li> Pipeline tutorial</li> --> +<!-- --> +<!-- <li> Pipeline tutorial</li> --> +<!-- --> +<!-- <li> What's New</li> --> +<!-- --> +<!-- <li> What's New</li> --> +<!-- --> +<!-- <li> Z-MERT</li> --> +<!-- --> +<!-- <li> Z-MERT</li> --> +<!-- --> +<!-- <li> Z-MERT</li> --> +<!-- --> +<!-- <li> Z-MERT</li> --> +<!-- --> +<!-- <li> </li> --> +<!-- --> +<!-- <li> </li> --> +<!-- --> +<!-- <li> </li> --> +<!-- --> +<!-- </ul> --> +<!-- </div> --> + + <div class="infobox"> + + <b>Links</b><br /> + <ul> + <li> <a href="../index.html">Main</a> </li> + <li> <a href="pipeline.html">Pipeline</a> </li> + <li> <a href="step-by-step-instructions.html">Manual walkthrough</a> </li> + <li> <a href="decoder.html">Decoder</a> </li> + <li> <a href="server.html">Decoder Server</a> </li> + <li> <a href="file-formats.html">File formats</a> </li> + <li> <a href="thrax.html">Grammar Extraction</a> </li> + <li> <a href="../releases.html">Releases</a> </li> + </ul> + </div> + + <div class="infobox"> + <b>Advanced</b><br /> + <ul> +<!-- <li> <a href="packing.html">Grammar packing</a> </li> --> + <li> <a href="large-lms.html">Building large LMs</a> </li> + <li> <a href="zmert.html">Running Z-MERT</a> </li> + <li> <a 
href="lattice.html">Lattices</a> </li> + <li> <a href="server.html">TCP/IP server</a> </li> + <li> <a href="bundle.html">Bundled configuration</a> </li> + </ul> + </div> + + <div class="infobox"> + <b>Help</b><br /> + <ul> + <li> <a href="faq.html">Answers</a> </li> + <li> <a href="https://groups.google.com/d/forum/joshua_support">Archive</a> </li> + </ul> + </div> + + <div class="footer"> + Last updated on April 08, 2016 + </div> + + </div> + + <div id="main"> + <div id="title"> + <h1>Features</h1> + </div> + + <div id="content"> + + <p>This file will contain information about the Joshua decoder features.</p> + + + </div> + </div> + + </body> +</html> + + + + +
http://git-wip-us.apache.org/repos/asf/incubator-joshua-site/blob/53cc3005/4.0/features.md ---------------------------------------------------------------------- diff --git a/4.0/features.md b/4.0/features.md deleted file mode 100644 index d915c82..0000000 --- a/4.0/features.md +++ /dev/null @@ -1,7 +0,0 @@ ---- -layout: default4 -category: links -title: Features ---- - -This file will contain information about the Joshua decoder features. http://git-wip-us.apache.org/repos/asf/incubator-joshua-site/blob/53cc3005/4.0/file-formats.html ---------------------------------------------------------------------- diff --git a/4.0/file-formats.html b/4.0/file-formats.html new file mode 100644 index 0000000..73770c6 --- /dev/null +++ b/4.0/file-formats.html @@ -0,0 +1,341 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> + +<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> + <head> + <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" /> + <link rel="stylesheet" type="text/css" media="screen,print" href="../joshua4.css" /> + <title>Joshua | Joshua file formats</title> + </head> + + <body> + + <div id="navbar"> + <a href="http://joshua-decoder.org/"> + <img src="../images/joshua-logo-small.png" width="130px" + alt="Joshua logo (picture of a Joshua tree)" /> + </a> + + <p class="infobox"> + <b>Stable version</b><br /> + 4.1<br/><br/> + <b>Release date</b><br /> + 2013 January + </p> + +<!-- <div class="infobox"> --> +<!-- <b>AUTO LINKS</b><br/> --> +<!-- <ul> --> +<!-- --> +<!-- <li> Advanced features</li> --> +<!-- --> +<!-- <li> Advanced features</li> --> +<!-- --> +<!-- <li> Advanced features</li> --> +<!-- --> +<!-- <li> Building a language pack</li> --> +<!-- --> +<!-- <li> Building a language pack</li> --> +<!-- --> +<!-- <li> Bundling a configuration</li> --> +<!-- --> +<!-- <li> Contributors</li> --> +<!-- --> +<!-- <li> Decoder 
configuration parameters</li> --> +<!-- --> +<!-- <li> Decoder configuration parameters</li> --> +<!-- --> +<!-- <li> Decoder configuration parameters</li> --> +<!-- --> +<!-- <li> Decoder configuration parameters</li> --> +<!-- --> +<!-- <li> Frequently Asked Questions</li> --> +<!-- --> +<!-- <li> Common problems</li> --> +<!-- --> +<!-- <li> Frequently Asked Questions</li> --> +<!-- --> +<!-- <li> Common problems</li> --> +<!-- --> +<!-- <li> Features</li> --> +<!-- --> +<!-- <li> Features</li> --> +<!-- --> +<!-- <li> Features</li> --> +<!-- --> +<!-- <li> Features</li> --> +<!-- --> +<!-- <li> Joshua file formats</li> --> +<!-- --> +<!-- <li> Joshua file formats</li> --> +<!-- --> +<!-- <li> Joshua file formats</li> --> +<!-- --> +<!-- <li> Joshua file formats</li> --> +<!-- --> +<!-- <li> </li> --> +<!-- --> +<!-- <li> </li> --> +<!-- --> +<!-- <li> </li> --> +<!-- --> +<!-- <li> Fisher and CALLHOME Spanish English Speech Translation Corpus</li> --> +<!-- --> +<!-- <li> Indian Languages Parallel Corpora</li> --> +<!-- --> +<!-- <li> Joshua 4.0 User Documentation</li> --> +<!-- --> +<!-- <li> Language packs</li> --> +<!-- --> +<!-- <li> Paraphrase Packs</li> --> +<!-- --> +<!-- <li> Joshua releases</li> --> +<!-- --> +<!-- <li> Support</li> --> +<!-- --> +<!-- <li> Getting Started</li> --> +<!-- --> +<!-- <li> Welcome to Joshua</li> --> +<!-- --> +<!-- <li> Joshua documentation</li> --> +<!-- --> +<!-- <li> Joshua documentation</li> --> +<!-- --> +<!-- <li> Installation</li> --> +<!-- --> +<!-- <li> Installation</li> --> +<!-- --> +<!-- <li> Alignment with Jacana</li> --> +<!-- --> +<!-- <li> Alignment with Jacana</li> --> +<!-- --> +<!-- <li> Alignment with Jacana</li> --> +<!-- --> +<!-- <li> Building large LMs with SRILM</li> --> +<!-- --> +<!-- <li> Building large LMs with SRILM</li> --> +<!-- --> +<!-- <li> Building large LMs with SRILM</li> --> +<!-- --> +<!-- <li> Building large LMs with SRILM</li> --> +<!-- --> +<!-- <li> Lattice decoding</li> --> 
+<!-- --> +<!-- <li> Grammar Packing</li> --> +<!-- --> +<!-- <li> Grammar Packing</li> --> +<!-- --> +<!-- <li> Grammar Packing</li> --> +<!-- --> +<!-- <li> Grammar Packing</li> --> +<!-- --> +<!-- <li> The Joshua Pipeline</li> --> +<!-- --> +<!-- <li> The Joshua Pipeline</li> --> +<!-- --> +<!-- <li> The Joshua Pipeline</li> --> +<!-- --> +<!-- <li> The Joshua Pipeline</li> --> +<!-- --> +<!-- <li> Quick Start</li> --> +<!-- --> +<!-- <li> Quick Start</li> --> +<!-- --> +<!-- <li> Releases</li> --> +<!-- --> +<!-- <li> Server mode</li> --> +<!-- --> +<!-- <li> Server mode</li> --> +<!-- --> +<!-- <li> Server mode</li> --> +<!-- --> +<!-- <li> Installing and running the Joshua Decoder</li> --> +<!-- --> +<!-- <li> Grammar extraction with Thrax</li> --> +<!-- --> +<!-- <li> Grammar extraction with Thrax</li> --> +<!-- --> +<!-- <li> Grammar extraction with Thrax</li> --> +<!-- --> +<!-- <li> Grammar extraction with Thrax</li> --> +<!-- --> +<!-- <li> Building Translation Models</li> --> +<!-- --> +<!-- <li> Building Translation Models</li> --> +<!-- --> +<!-- <li> Building Translation Models</li> --> +<!-- --> +<!-- <li> Building Translation Models</li> --> +<!-- --> +<!-- <li> Pipeline tutorial</li> --> +<!-- --> +<!-- <li> Pipeline tutorial</li> --> +<!-- --> +<!-- <li> Pipeline tutorial</li> --> +<!-- --> +<!-- <li> What's New</li> --> +<!-- --> +<!-- <li> What's New</li> --> +<!-- --> +<!-- <li> Z-MERT</li> --> +<!-- --> +<!-- <li> Z-MERT</li> --> +<!-- --> +<!-- <li> Z-MERT</li> --> +<!-- --> +<!-- <li> Z-MERT</li> --> +<!-- --> +<!-- <li> </li> --> +<!-- --> +<!-- <li> </li> --> +<!-- --> +<!-- <li> </li> --> +<!-- --> +<!-- </ul> --> +<!-- </div> --> + + <div class="infobox"> + + <b>Links</b><br /> + <ul> + <li> <a href="../index.html">Main</a> </li> + <li> <a href="pipeline.html">Pipeline</a> </li> + <li> <a href="step-by-step-instructions.html">Manual walkthrough</a> </li> + <li> <a href="decoder.html">Decoder</a> </li> + <li> <a 
href="server.html">Decoder Server</a> </li> + <li> <a href="file-formats.html">File formats</a> </li> + <li> <a href="thrax.html">Grammar Extraction</a> </li> + <li> <a href="../releases.html">Releases</a> </li> + </ul> + </div> + + <div class="infobox"> + <b>Advanced</b><br /> + <ul> +<!-- <li> <a href="packing.html">Grammar packing</a> </li> --> + <li> <a href="large-lms.html">Building large LMs</a> </li> + <li> <a href="zmert.html">Running Z-MERT</a> </li> + <li> <a href="lattice.html">Lattices</a> </li> + <li> <a href="server.html">TCP/IP server</a> </li> + <li> <a href="bundle.html">Bundled configuration</a> </li> + </ul> + </div> + + <div class="infobox"> + <b>Help</b><br /> + <ul> + <li> <a href="faq.html">Answers</a> </li> + <li> <a href="https://groups.google.com/d/forum/joshua_support">Archive</a> </li> + </ul> + </div> + + <div class="footer"> + Last updated on April 08, 2016 + </div> + + </div> + + <div id="main"> + <div id="title"> + <h1>Joshua file formats</h1> + </div> + + <div id="content"> + + <p>This page describes the formats of Joshua configuration and support files.</p> + +<h2 id="translation-models-grammars">Translation models (grammars)</h2> + +<p>Joshua supports three grammar file formats.</p> + +<ol> + <li>Thrax / Hiero</li> + <li>SAMT [deprecated]</li> + <li>packed</li> +</ol> + +<p>The <em>Hiero</em> format is not restricted to Hiero grammars, but simply means <em>the format that David +Chiang developed for Hiero</em>. It can support a much broader class of SCFGs containing an arbitrary +set of nonterminals. Similarly, the <em>SAMT</em> format is not restricted to SAMT grammars but instead +simply denotes <em>the grammar format that Zollmann and Venugopal developed for their decoder</em>. 
To +remove this source of confusion, “thrax” is the preferred format designation, and is in fact the +default.</p> + +<p>The packed grammar format is the efficient grammar representation developed by +<a href="http://cs.jhu.edu/~juri">Juri Ganitkevich</a> and <a href="packing.html">is described in detail elsewhere</a>.</p> + +<p>Grammar rules in the Thrax format follow this format:</p> + +<div class="highlighter-rouge"><pre class="highlight"><code>[LHS] ||| SOURCE-SIDE ||| TARGET-SIDE ||| FEATURES +</code></pre> +</div> + +<p>Here are two examples, one for a Hiero grammar, and the other for an SAMT grammar:</p> + +<div class="highlighter-rouge"><pre class="highlight"><code>[X] ||| el chico [X] ||| the boy [X] ||| -3.14 0 2 17 +[S] ||| el chico [VP] ||| the boy [VP] ||| -3.14 0 2 17 +</code></pre> +</div> + +<p>The feature values can have optional labels, e.g.:</p> + +<div class="highlighter-rouge"><pre class="highlight"><code>[X] ||| el chico [X] ||| the boy [X] ||| lexprob=-3.14 abstract=0 numwords=2 count=17 +</code></pre> +</div> + +<p>These feature names are made up. For an actual list of feature names, please +<a href="thrax.html">see the Thrax documentation</a>.</p> + +<p>The SAMT grammar format is deprecated and undocumented.</p> + +<h2 id="language-model">Language Model</h2> + +<p>Joshua has three language model implementations: <a href="">KenLM</a>, <a href="">BerkeleyLM</a>, and an (unrecommended) +dummy Java implementation. All language model implementations support the standard ARPA format +output by <a href="">SRILM</a>. 
In addition, KenLM and BerkeleyLM support compiled formats that can be loaded +more quickly and efficiently.</p> + +<h3 id="compiling-for-kenlm">Compiling for KenLM</h3> + +<p>To compile an ARPA language model for KenLM, use the (provided) <code class="highlighter-rouge">build_binary</code> command, located deep within +the Joshua source code:</p> + +<div class="highlighter-rouge"><pre class="highlight"><code>$JOSHUA/src/joshua/decoder/ff/lm/kenlm/build_binary lm.arpa lm.kenlm +</code></pre> +</div> + +<p>This script takes the <code class="highlighter-rouge">lm.arpa</code> file and produces the compiled version in <code class="highlighter-rouge">lm.kenlm</code>.</p> + +<h3 id="compiling-for-berkeleylm">Compiling for BerkeleyLM</h3> + +<p>To compile a language model for BerkeleyLM, type:</p> + +<div class="highlighter-rouge"><pre class="highlight"><code>java -cp $JOSHUA/lib/berkeleylm.jar -server -mxMEM edu.berkeley.nlp.lm.io.MakeLmBinaryFromArpa lm.arpa lm.berkeleylm +</code></pre> +</div> + +<p>The <code class="highlighter-rouge">lm.berkeleylm</code> file can then be listed directly in the <a href="decoder.html">Joshua configuration file</a>.</p> + +<h2 id="joshua-configuration">Joshua configuration</h2> + +<p>See <a href="decoder.html">the decoder page</a>.</p> + +<h2 id="pipeline-configuration">Pipeline configuration</h2> + +<p>See <a href="pipeline.html">the pipeline page</a>.</p> + +<h2 id="thrax-configuration">Thrax configuration</h2> + +<p>See <a href="thrax.html">the thrax page</a>.</p> + + + </div> + </div> + + </body> +</html> + + + + + http://git-wip-us.apache.org/repos/asf/incubator-joshua-site/blob/53cc3005/4.0/file-formats.md ---------------------------------------------------------------------- diff --git a/4.0/file-formats.md b/4.0/file-formats.md deleted file mode 100644 index c10f906..0000000 --- a/4.0/file-formats.md +++ /dev/null @@ -1,78 +0,0 @@ ---- -layout: default4 -category: advanced -title: Joshua file formats ---- -This page describes the formats of 
Joshua configuration and support files. - -## Translation models (grammars) - -Joshua supports three grammar file formats. - -1. Thrax / Hiero -1. SAMT [deprecated] -1. packed - -The *Hiero* format is not restricted to Hiero grammars, but simply means *the format that David -Chiang developed for Hiero*. It can support a much broader class of SCFGs containing an arbitrary -set of nonterminals. Similarly, the *SAMT* format is not restricted to SAMT grammars but instead -simply denotes *the grammar format that Zollmann and Venugopal developed for their decoder*. To -remove this source of confusion, "thrax" is the preferred format designation, and is in fact the -default. - -The packed grammar format is the efficient grammar representation developed by -[Juri Ganitkevich](http://cs.jhu.edu/~juri) [is described in detail elsewhere](packing.html). - -Grammar rules in the Thrax format follow this format: - - [LHS] ||| SOURCE-SIDE ||| TARGET-SIDE ||| FEATURES - -Here are some two examples, one for a Hiero grammar, and the other for an SAMT grammar: - - [X] ||| el chico [X] ||| the boy [X] ||| -3.14 0 2 17 - [S] ||| el chico [VP] ||| the boy [VP] ||| -3.14 0 2 17 - -The feature values can have optional labels, e.g.: - - [X] ||| el chico [X] ||| the boy [X] ||| lexprob=-3.14 abstract=0 numwords=2 count=17 - -These feature names are made up. For an actual list of feature names, please -[see the Thrax documentation](thrax.html). - -The SAMT grammar format is deprecated and undocumented. - -## Language Model - -Joshua has three language model implementations: [KenLM](), [BerkeleyLM](), and an (unrecommended) -dummy Java implementation. All language model implementations support the standard ARPA format -output by [SRILM](). In addition, KenLM and BerkeleyLM support compiled formats that can be loaded -more quickly and efficiently. 
- -### Compiling for KenLM - -To compile an ARPA grammar for KenLM, use the (provided) `build-binary` command, located deep within -the Joshua source code: - - $JOSHUA/src/joshua/decoder/ff/lm/kenlm/build_binary lm.arpa lm.kenlm - -This script takes the `lm.arpa` file and produces the compiled version in `lm.kenlm`. - -### Compiling for BerkeleyLM - -To compile a grammar for BerkeleyLM, type: - - java -cp $JOSHUA/lib/berkeleylm.jar -server -mxMEM edu.berkeley.nlp.lm.io.MakeLmBinaryFromArpa lm.arpa lm.berkeleylm - -The `lm.berkeleylm` file can then be listed directly in the [Joshua configuration file](decoder.html). - -## Joshua configuration - -See [the decoder page](decoder.html). - -## Pipeline configuration - -See [the pipeline page](pipeline.html). - -## Thrax configuration - -See [the thrax page](thrax.html). http://git-wip-us.apache.org/repos/asf/incubator-joshua-site/blob/53cc3005/4.0/index.html ---------------------------------------------------------------------- diff --git a/4.0/index.html b/4.0/index.html new file mode 100644 index 0000000..216b006 --- /dev/null +++ b/4.0/index.html @@ -0,0 +1,309 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> + +<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> + <head> + <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" /> + <link rel="stylesheet" type="text/css" media="screen,print" href="../joshua4.css" /> + <title>Joshua | Joshua 4.0 User Documentation</title> + </head> + + <body> + + <div id="navbar"> + <a href="http://joshua-decoder.org/"> + <img src="../images/joshua-logo-small.png" width="130px" + alt="Joshua logo (picture of a Joshua tree)" /> + </a> + + <p class="infobox"> + <b>Stable version</b><br /> + 4.1<br/><br/> + <b>Release date</b><br /> + 2013 January + </p> + +<!-- <div class="infobox"> --> +<!-- <b>AUTO LINKS</b><br/> --> +<!-- <ul> --> +<!-- --> +<!-- <li> 
Advanced features</li> --> +<!-- --> +<!-- <li> Advanced features</li> --> +<!-- --> +<!-- <li> Advanced features</li> --> +<!-- --> +<!-- <li> Building a language pack</li> --> +<!-- --> +<!-- <li> Building a language pack</li> --> +<!-- --> +<!-- <li> Bundling a configuration</li> --> +<!-- --> +<!-- <li> Contributors</li> --> +<!-- --> +<!-- <li> Decoder configuration parameters</li> --> +<!-- --> +<!-- <li> Decoder configuration parameters</li> --> +<!-- --> +<!-- <li> Decoder configuration parameters</li> --> +<!-- --> +<!-- <li> Decoder configuration parameters</li> --> +<!-- --> +<!-- <li> Frequently Asked Questions</li> --> +<!-- --> +<!-- <li> Common problems</li> --> +<!-- --> +<!-- <li> Frequently Asked Questions</li> --> +<!-- --> +<!-- <li> Common problems</li> --> +<!-- --> +<!-- <li> Features</li> --> +<!-- --> +<!-- <li> Features</li> --> +<!-- --> +<!-- <li> Features</li> --> +<!-- --> +<!-- <li> Features</li> --> +<!-- --> +<!-- <li> Joshua file formats</li> --> +<!-- --> +<!-- <li> Joshua file formats</li> --> +<!-- --> +<!-- <li> Joshua file formats</li> --> +<!-- --> +<!-- <li> Joshua file formats</li> --> +<!-- --> +<!-- <li> </li> --> +<!-- --> +<!-- <li> </li> --> +<!-- --> +<!-- <li> </li> --> +<!-- --> +<!-- <li> Fisher and CALLHOME Spanish English Speech Translation Corpus</li> --> +<!-- --> +<!-- <li> Indian Languages Parallel Corpora</li> --> +<!-- --> +<!-- <li> Joshua 4.0 User Documentation</li> --> +<!-- --> +<!-- <li> Language packs</li> --> +<!-- --> +<!-- <li> Paraphrase Packs</li> --> +<!-- --> +<!-- <li> Joshua releases</li> --> +<!-- --> +<!-- <li> Support</li> --> +<!-- --> +<!-- <li> Getting Started</li> --> +<!-- --> +<!-- <li> Welcome to Joshua</li> --> +<!-- --> +<!-- <li> Joshua documentation</li> --> +<!-- --> +<!-- <li> Joshua documentation</li> --> +<!-- --> +<!-- <li> Installation</li> --> +<!-- --> +<!-- <li> Installation</li> --> +<!-- --> +<!-- <li> Alignment with Jacana</li> --> +<!-- --> +<!-- <li> Alignment with 
Jacana</li> --> +<!-- --> +<!-- <li> Alignment with Jacana</li> --> +<!-- --> +<!-- <li> Building large LMs with SRILM</li> --> +<!-- --> +<!-- <li> Building large LMs with SRILM</li> --> +<!-- --> +<!-- <li> Building large LMs with SRILM</li> --> +<!-- --> +<!-- <li> Building large LMs with SRILM</li> --> +<!-- --> +<!-- <li> Lattice decoding</li> --> +<!-- --> +<!-- <li> Grammar Packing</li> --> +<!-- --> +<!-- <li> Grammar Packing</li> --> +<!-- --> +<!-- <li> Grammar Packing</li> --> +<!-- --> +<!-- <li> Grammar Packing</li> --> +<!-- --> +<!-- <li> The Joshua Pipeline</li> --> +<!-- --> +<!-- <li> The Joshua Pipeline</li> --> +<!-- --> +<!-- <li> The Joshua Pipeline</li> --> +<!-- --> +<!-- <li> The Joshua Pipeline</li> --> +<!-- --> +<!-- <li> Quick Start</li> --> +<!-- --> +<!-- <li> Quick Start</li> --> +<!-- --> +<!-- <li> Releases</li> --> +<!-- --> +<!-- <li> Server mode</li> --> +<!-- --> +<!-- <li> Server mode</li> --> +<!-- --> +<!-- <li> Server mode</li> --> +<!-- --> +<!-- <li> Installing and running the Joshua Decoder</li> --> +<!-- --> +<!-- <li> Grammar extraction with Thrax</li> --> +<!-- --> +<!-- <li> Grammar extraction with Thrax</li> --> +<!-- --> +<!-- <li> Grammar extraction with Thrax</li> --> +<!-- --> +<!-- <li> Grammar extraction with Thrax</li> --> +<!-- --> +<!-- <li> Building Translation Models</li> --> +<!-- --> +<!-- <li> Building Translation Models</li> --> +<!-- --> +<!-- <li> Building Translation Models</li> --> +<!-- --> +<!-- <li> Building Translation Models</li> --> +<!-- --> +<!-- <li> Pipeline tutorial</li> --> +<!-- --> +<!-- <li> Pipeline tutorial</li> --> +<!-- --> +<!-- <li> Pipeline tutorial</li> --> +<!-- --> +<!-- <li> What's New</li> --> +<!-- --> +<!-- <li> What's New</li> --> +<!-- --> +<!-- <li> Z-MERT</li> --> +<!-- --> +<!-- <li> Z-MERT</li> --> +<!-- --> +<!-- <li> Z-MERT</li> --> +<!-- --> +<!-- <li> Z-MERT</li> --> +<!-- --> +<!-- <li> </li> --> +<!-- --> +<!-- <li> </li> --> +<!-- --> +<!-- <li> </li> --> 
+<!-- --> +<!-- </ul> --> +<!-- </div> --> + + <div class="infobox"> + + <b>Links</b><br /> + <ul> + <li> <a href="../index.html">Main</a> </li> + <li> <a href="pipeline.html">Pipeline</a> </li> + <li> <a href="step-by-step-instructions.html">Manual walkthrough</a> </li> + <li> <a href="decoder.html">Decoder</a> </li> + <li> <a href="server.html">Decoder Server</a> </li> + <li> <a href="file-formats.html">File formats</a> </li> + <li> <a href="thrax.html">Grammar Extraction</a> </li> + <li> <a href="../releases.html">Releases</a> </li> + </ul> + </div> + + <div class="infobox"> + <b>Advanced</b><br /> + <ul> +<!-- <li> <a href="packing.html">Grammar packing</a> </li> --> + <li> <a href="large-lms.html">Building large LMs</a> </li> + <li> <a href="zmert.html">Running Z-MERT</a> </li> + <li> <a href="lattice.html">Lattices</a> </li> + <li> <a href="server.html">TCP/IP server</a> </li> + <li> <a href="bundle.html">Bundled configuration</a> </li> + </ul> + </div> + + <div class="infobox"> + <b>Help</b><br /> + <ul> + <li> <a href="faq.html">Answers</a> </li> + <li> <a href="https://groups.google.com/d/forum/joshua_support">Archive</a> </li> + </ul> + </div> + + <div class="footer"> + Last updated on April 08, 2016 + </div> + + </div> + + <div id="main"> + <div id="title"> + <h1>Joshua 4.0 User Documentation</h1> + </div> + + <div id="content"> + + <p>This page contains end-user oriented documentation for the 4.0 release of +<a href="http://joshua-decoder.org/">the Joshua decoder</a>.</p> + +<h2 id="download-and-setup">Download and Setup</h2> + +<ol> + <li> + <p>Follow <a href="http://cs.jhu.edu/~post/files/joshua-4.0.tgz">this link</a> to download Joshua, or do it +from the command line:</p> + + <div class="highlighter-rouge"><pre class="highlight"><code>wget -q http://cs.jhu.edu/~post/files/joshua-4.0.tgz +</code></pre> + </div> + </li> + <li> + <p>Next, unpack it, set the <code class="highlighter-rouge">$JOSHUA</code> environment variable, and compile everything:</p> 
+ + <div class="highlighter-rouge"><pre class="highlight"><code>tar xzf joshua-4.0.tgz +cd joshua-4.0 + +# for bash +export JOSHUA=$(pwd) +echo "export JOSHUA=$JOSHUA" >> ~/.bashrc + +# for tcsh +setenv JOSHUA `pwd` +echo "setenv JOSHUA $JOSHUA" >> ~/.profile + +ant all +</code></pre> + </div> + </li> + <li> + <p>That’s it.</p> + </li> +</ol> + +<h2 id="quick-start">Quick start</h2> + +<p>If you just want to run the complete machine translation pipeline (beginning with data preparation, +through alignment, hierarchical model building, tuning, testing, and reporting), we recommend you +use our <a href="pipeline.html">pipeline script</a>. You might also be interested in +<a href="http://cs.jhu.edu/~ccb/joshua/">Chris’ old walkthrough</a>.</p> + +<h2 id="more-information">More information</h2> + +<p>For more detail on the decoder itself, including its command-line options, see +<a href="decoder.html">the Joshua decoder page</a>. You can also learn more about other steps of +<a href="pipeline.html">the Joshua MT pipeline</a>, including <a href="thrax.html">grammar extraction</a> with Thrax and +Joshua’s <a href="packing.html">efficient grammar representation</a> (new with version 4.0).</p> + +<p>If you have problems or issues, you might find some help <a href="faq.html">on our answers page</a> or +<a href="https://groups.google.com/forum/?fromgroups#!forum/joshua_support">in the mailing list archives</a>.</p> + + + </div> + </div> + + </body> +</html> + + + + + http://git-wip-us.apache.org/repos/asf/incubator-joshua-site/blob/53cc3005/4.0/index.md ---------------------------------------------------------------------- diff --git a/4.0/index.md b/4.0/index.md deleted file mode 100644 index ae62e4e..0000000 --- a/4.0/index.md +++ /dev/null @@ -1,48 +0,0 @@ ---- -layout: default4 -title: Joshua 4.0 User Documentation ---- - -This page contains end-user oriented documentation for the 4.0 release of -[the Joshua decoder](http://joshua-decoder.org/). 
- -## Download and Setup - -1. Follow [this link](http://cs.jhu.edu/~post/files/joshua-4.0.tgz) to download Joshua, or do it -from the command line: - - wget -q http://cs.jhu.edu/~post/files/joshua-4.0.tgz - -2. Next, unpack it, set the `$JOSHUA` environment variable, and compile everything: - - tar xzf joshua-4.0.tgz - cd joshua-4.0 - - # for bash - export JOSHUA=$(pwd) - echo "export JOSHUA=$JOSHUA" >> ~/.bashrc - - # for tcsh - setenv JOSHUA `pwd` - echo "setenv JOSHUA $JOSHUA" >> ~/.profile - - ant all - -3. That's it. - -## Quick start - -If you just want to run the complete machine translation pipeline (beginning with data preparation, -through alignment, hierarchical model building, tuning, testing, and reporting), we recommend you -use our <a href="pipeline.html">pipeline script</a>. You might also be interested in -[Chris' old walkthrough](http://cs.jhu.edu/~ccb/joshua/). - -## More information - -For more detail on the decoder itself, including its command-line options, see -[the Joshua decoder page](decoder.html). You can also learn more about other steps of -[the Joshua MT pipeline](pipeline.html), including [grammar extraction](thrax.html) with Thrax and -Joshua's [efficient grammar representation](packing.html) (new with version 4.0). - -If you have problems or issues, you might find some help [on our answers page](faq.html) or -[in the mailing list archives](https://groups.google.com/forum/?fromgroups#!forum/joshua_support). 
http://git-wip-us.apache.org/repos/asf/incubator-joshua-site/blob/53cc3005/4.0/large-lms.html ---------------------------------------------------------------------- diff --git a/4.0/large-lms.html b/4.0/large-lms.html new file mode 100644 index 0000000..d591057 --- /dev/null +++ b/4.0/large-lms.html @@ -0,0 +1,455 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> + +<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> + <head> + <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" /> + <link rel="stylesheet" type="text/css" media="screen,print" href="../joshua4.css" /> + <title>Joshua | Building large LMs with SRILM</title> + </head> + + <body> + + <div id="navbar"> + <a href="http://joshua-decoder.org/"> + <img src="../images/joshua-logo-small.png" width="130px" + alt="Joshua logo (picture of a Joshua tree)" /> + </a> + + <p class="infobox"> + <b>Stable version</b><br /> + 4.1<br/><br/> + <b>Release date</b><br /> + 2013 January + </p> + +<!-- <div class="infobox"> --> +<!-- <b>AUTO LINKS</b><br/> --> +<!-- <ul> --> +<!-- --> +<!-- <li> Advanced features</li> --> +<!-- --> +<!-- <li> Advanced features</li> --> +<!-- --> +<!-- <li> Advanced features</li> --> +<!-- --> +<!-- <li> Building a language pack</li> --> +<!-- --> +<!-- <li> Building a language pack</li> --> +<!-- --> +<!-- <li> Bundling a configuration</li> --> +<!-- --> +<!-- <li> Contributors</li> --> +<!-- --> +<!-- <li> Decoder configuration parameters</li> --> +<!-- --> +<!-- <li> Decoder configuration parameters</li> --> +<!-- --> +<!-- <li> Decoder configuration parameters</li> --> +<!-- --> +<!-- <li> Decoder configuration parameters</li> --> +<!-- --> +<!-- <li> Frequently Asked Questions</li> --> +<!-- --> +<!-- <li> Common problems</li> --> +<!-- --> +<!-- <li> Frequently Asked Questions</li> --> +<!-- --> +<!-- <li> Common problems</li> --> +<!-- --> +<!-- <li> 
Features</li> --> +<!-- --> +<!-- <li> Features</li> --> +<!-- --> +<!-- <li> Features</li> --> +<!-- --> +<!-- <li> Features</li> --> +<!-- --> +<!-- <li> Joshua file formats</li> --> +<!-- --> +<!-- <li> Joshua file formats</li> --> +<!-- --> +<!-- <li> Joshua file formats</li> --> +<!-- --> +<!-- <li> Joshua file formats</li> --> +<!-- --> +<!-- <li> </li> --> +<!-- --> +<!-- <li> </li> --> +<!-- --> +<!-- <li> </li> --> +<!-- --> +<!-- <li> Fisher and CALLHOME Spanish English Speech Translation Corpus</li> --> +<!-- --> +<!-- <li> Indian Languages Parallel Corpora</li> --> +<!-- --> +<!-- <li> Joshua 4.0 User Documentation</li> --> +<!-- --> +<!-- <li> Language packs</li> --> +<!-- --> +<!-- <li> Paraphrase Packs</li> --> +<!-- --> +<!-- <li> Joshua releases</li> --> +<!-- --> +<!-- <li> Support</li> --> +<!-- --> +<!-- <li> Getting Started</li> --> +<!-- --> +<!-- <li> Welcome to Joshua</li> --> +<!-- --> +<!-- <li> Joshua documentation</li> --> +<!-- --> +<!-- <li> Joshua documentation</li> --> +<!-- --> +<!-- <li> Installation</li> --> +<!-- --> +<!-- <li> Installation</li> --> +<!-- --> +<!-- <li> Alignment with Jacana</li> --> +<!-- --> +<!-- <li> Alignment with Jacana</li> --> +<!-- --> +<!-- <li> Alignment with Jacana</li> --> +<!-- --> +<!-- <li> Building large LMs with SRILM</li> --> +<!-- --> +<!-- <li> Building large LMs with SRILM</li> --> +<!-- --> +<!-- <li> Building large LMs with SRILM</li> --> +<!-- --> +<!-- <li> Building large LMs with SRILM</li> --> +<!-- --> +<!-- <li> Lattice decoding</li> --> +<!-- --> +<!-- <li> Grammar Packing</li> --> +<!-- --> +<!-- <li> Grammar Packing</li> --> +<!-- --> +<!-- <li> Grammar Packing</li> --> +<!-- --> +<!-- <li> Grammar Packing</li> --> +<!-- --> +<!-- <li> The Joshua Pipeline</li> --> +<!-- --> +<!-- <li> The Joshua Pipeline</li> --> +<!-- --> +<!-- <li> The Joshua Pipeline</li> --> +<!-- --> +<!-- <li> The Joshua Pipeline</li> --> +<!-- --> +<!-- <li> Quick Start</li> --> +<!-- --> +<!-- <li> Quick 
Start</li> --> +<!-- --> +<!-- <li> Releases</li> --> +<!-- --> +<!-- <li> Server mode</li> --> +<!-- --> +<!-- <li> Server mode</li> --> +<!-- --> +<!-- <li> Server mode</li> --> +<!-- --> +<!-- <li> Installing and running the Joshua Decoder</li> --> +<!-- --> +<!-- <li> Grammar extraction with Thrax</li> --> +<!-- --> +<!-- <li> Grammar extraction with Thrax</li> --> +<!-- --> +<!-- <li> Grammar extraction with Thrax</li> --> +<!-- --> +<!-- <li> Grammar extraction with Thrax</li> --> +<!-- --> +<!-- <li> Building Translation Models</li> --> +<!-- --> +<!-- <li> Building Translation Models</li> --> +<!-- --> +<!-- <li> Building Translation Models</li> --> +<!-- --> +<!-- <li> Building Translation Models</li> --> +<!-- --> +<!-- <li> Pipeline tutorial</li> --> +<!-- --> +<!-- <li> Pipeline tutorial</li> --> +<!-- --> +<!-- <li> Pipeline tutorial</li> --> +<!-- --> +<!-- <li> What's New</li> --> +<!-- --> +<!-- <li> What's New</li> --> +<!-- --> +<!-- <li> Z-MERT</li> --> +<!-- --> +<!-- <li> Z-MERT</li> --> +<!-- --> +<!-- <li> Z-MERT</li> --> +<!-- --> +<!-- <li> Z-MERT</li> --> +<!-- --> +<!-- <li> </li> --> +<!-- --> +<!-- <li> </li> --> +<!-- --> +<!-- <li> </li> --> +<!-- --> +<!-- </ul> --> +<!-- </div> --> + + <div class="infobox"> + + <b>Links</b><br /> + <ul> + <li> <a href="../index.html">Main</a> </li> + <li> <a href="pipeline.html">Pipeline</a> </li> + <li> <a href="step-by-step-instructions.html">Manual walkthrough</a> </li> + <li> <a href="decoder.html">Decoder</a> </li> + <li> <a href="server.html">Decoder Server</a> </li> + <li> <a href="file-formats.html">File formats</a> </li> + <li> <a href="thrax.html">Grammar Extraction</a> </li> + <li> <a href="../releases.html">Releases</a> </li> + </ul> + </div> + + <div class="infobox"> + <b>Advanced</b><br /> + <ul> +<!-- <li> <a href="packing.html">Grammar packing</a> </li> --> + <li> <a href="large-lms.html">Building large LMs</a> </li> + <li> <a href="zmert.html">Running Z-MERT</a> </li> + <li> <a 
href="lattice.html">Lattices</a> </li> + <li> <a href="server.html">TCP/IP server</a> </li> + <li> <a href="bundle.html">Bundled configuration</a> </li> + </ul> + </div> + + <div class="infobox"> + <b>Help</b><br /> + <ul> + <li> <a href="faq.html">Answers</a> </li> + <li> <a href="https://groups.google.com/d/forum/joshua_support">Archive</a> </li> + </ul> + </div> + + <div class="footer"> + Last updated on April 08, 2016 + </div> + + </div> + + <div id="main"> + <div id="title"> + <h1>Building large LMs with SRILM</h1> + </div> + + <div id="content"> + + <p>The following is a tutorial for building a large language model from the +English Gigaword Fifth Edition corpus +<a href="http://www.ldc.upenn.edu/Catalog/catalogEntry.jsp?catalogId=LDC2011T07">LDC2011T07</a> +using SRILM. English text is provided from seven different sources.</p> + +<h3 id="step-0-clean-up-the-corpus">Step 0: Clean up the corpus</h3> + +<p>The Gigaword corpus has to be stripped of all SGML tags and tokenized. +Instructions for performing those steps are not included in this +documentation. A description of this process can be found in a paper +called <a href="https://akbcwekex2012.files.wordpress.com/2012/05/28_paper.pdf">âAnnotated +Gigawordâ</a>.</p> + +<p>The Joshua package ships with a script that converts all alphabetical +characters to their lowercase equivalent. The script is located at +<code class="highlighter-rouge">$JOSHUA/scripts/lowercase.perl</code>.</p> + +<p>Make a directory structure as follows:</p> + +<div class="highlighter-rouge"><pre class="highlight"><code>gigaword/ +âââ corpus/ +â  âââ afp_eng/ +â  â  âââ afp_eng_199405.lc.gz +â  â  âââ afp_eng_199406.lc.gz +â  â  âââ ... +â  â  âââ counts/ +â  âââ apw_eng/ +â  â  âââ apw_eng_199411.lc.gz +â  â  âââ apw_eng_199412.lc.gz +â  â  âââ ... +â  â  âââ counts/ +â  âââ cna_eng/ +â  â  âââ ... +â  â  âââ counts/ +â  âââ ltw_eng/ +â  â  âââ ... +â  â  âââ counts/ +â  âââ nyt_eng/ +â  â  âââ ... 
+â  â  âââ counts/ +â  âââ wpb_eng/ +â  â  âââ ... +â  â  âââ counts/ +â  âââ xin_eng/ +â    âââ ... +â    âââ counts/ +âââ lm/ +   âââ afp_eng/ +   âââ apw_eng/ +   âââ cna_eng/ +   âââ ltw_eng/ +   âââ nyt_eng/ +   âââ wpb_eng/ +   âââ xin_eng/ +</code></pre> +</div> + +<p>The next step will be to build smaller LMs and then interpolate them into one +file.</p> + +<h3 id="step-1-count-ngrams">Step 1: Count ngrams</h3> + +<p>Run the following script once from each source directory under the <code class="highlighter-rouge">corpus/</code> +directory (edit it to specify the path to the <code class="highlighter-rouge">ngram-count</code> binary as well as +the number of processors):</p> + +<div class="highlighter-rouge"><pre class="highlight"><code><span class="c">#!/bin/sh</span> + +<span class="nv">NGRAM_COUNT</span><span class="o">=</span><span class="nv">$SRILM_SRC</span>/bin/i686-m64/ngram-count +<span class="nv">args</span><span class="o">=</span><span class="s2">""</span> + +<span class="k">for </span><span class="nb">source </span><span class="k">in</span> <span class="k">*</span>.gz; <span class="k">do + </span><span class="nv">args</span><span class="o">=</span><span class="nv">$args</span><span class="s2">"-sort -order 5 -text </span><span class="nv">$source</span><span class="s2"> -write counts/</span><span class="nv">$source</span><span class="s2">-counts.gz "</span> +<span class="k">done + +</span><span class="nb">echo</span> <span class="nv">$args</span> | xargs --max-procs<span class="o">=</span>4 -n 7 <span class="nv">$NGRAM_COUNT</span> +</code></pre> +</div> + +<p>Then move each <code class="highlighter-rouge">counts/</code> directory to the corresponding directory under +<code class="highlighter-rouge">lm/</code>. 
Now that each ngram has been counted, we can make a language +model for each of the seven sources.</p> + +<h3 id="step-2-make-individual-language-models">Step 2: Make individual language models</h3> + +<p>SRILM includes a script, called <code class="highlighter-rouge">make-big-lm</code>, for building large language +models under resource-limited environments. The manual for this script can be +read online +<a href="http://www-speech.sri.com/projects/srilm/manpages/training-scripts.1.html">here</a>. +Since the Gigaword corpus is so large, it is convenient to use <code class="highlighter-rouge">make-big-lm</code> +even in environments with many parallel processors and a lot of memory.</p> + +<p>Initiate the following script from each of the source directories under the +<code class="highlighter-rouge">lm/</code> directory (edit it to specify the path to the <code class="highlighter-rouge">make-big-lm</code> script as +well as the pruning threshold):</p> + +<div class="highlighter-rouge"><pre class="highlight"><code><span class="c">#!/bin/bash</span> +<span class="nb">set</span> -x + +<span class="nv">CMD</span><span class="o">=</span><span class="nv">$SRILM_SRC</span>/bin/make-big-lm +<span class="nv">PRUNE_THRESHOLD</span><span class="o">=</span>1e-8 + +<span class="nv">$CMD</span> <span class="se">\</span> + -name gigalm <span class="sb">`</span><span class="k">for </span>k <span class="k">in </span>counts/<span class="k">*</span>.gz; <span class="k">do </span><span class="nb">echo</span> <span class="s2">" </span><span class="se">\</span><span class="s2"> + -read </span><span class="nv">$k</span><span class="s2"> "</span>; <span class="k">done</span><span class="sb">`</span> <span class="se">\</span> + -lm lm.gz <span class="se">\</span> + -max-per-file 100000000 <span class="se">\</span> + -order 5 <span class="se">\</span> + -kndiscount <span class="se">\</span> + -interpolate <span class="se">\</span> + -unk <span class="se">\</span> + -prune <span 
class="nv">$PRUNE_THRESHOLD</span> +</code></pre> +</div> + +<p>The language model attributes chosen are the following:</p> + +<ul> + <li>N-grams up to order 5</li> + <li>Kneser-Ney smoothing</li> + <li>N-gram probability estimates at the specified order <em>n</em> are interpolated with +lower-order estimates</li> + <li>include the unknown-word token as a regular word</li> + <li>pruning N-grams based on the specified threshold</li> +</ul> + +<p>Next, we will mix the models together into a single file.</p> + +<h3 id="step-3-mix-models-together">Step 3: Mix models together</h3> + +<p>Using development text, interpolation weights can be determined that give the highest +weight to the source language models that have the lowest perplexity on the +specified development set.</p> + +<h4 id="step-3-1-determine-interpolation-weights">Step 3-1: Determine interpolation weights</h4> + +<p>Initiate the following script from the <code class="highlighter-rouge">lm/</code> directory (edit it to specify the +path to the <code class="highlighter-rouge">ngram</code> binary as well as the path to the development text file):</p> + +<div class="highlighter-rouge"><pre class="highlight"><code><span class="c">#!/bin/bash</span> +<span class="nb">set</span> -x + +<span class="nv">NGRAM</span><span class="o">=</span><span class="nv">$SRILM_SRC</span>/bin/i686-m64/ngram +<span class="nv">DEV_TEXT</span><span class="o">=</span>~mpost/expts/wmt12/runs/es-en/data/tune/tune.tok.lc.es + +<span class="nb">dirs</span><span class="o">=(</span> afp_eng apw_eng cna_eng ltw_eng nyt_eng wpb_eng xin_eng <span class="o">)</span> + +<span class="k">for </span>d <span class="k">in</span> <span class="k">${</span><span class="nv">dirs</span><span class="p">[@]</span><span class="k">}</span> ; <span class="k">do</span> + <span class="nv">$NGRAM</span> -debug 2 -order 5 -unk -lm <span class="nv">$d</span>/lm.gz -ppl <span class="nv">$DEV_TEXT</span> > <span class="nv">$d</span>/lm.ppl ; +<span class="k">done + 
+</span>compute-best-mix <span class="k">*</span>/lm.ppl > best-mix.ppl +</code></pre> +</div> + +<p>Take a look at the contents of <code class="highlighter-rouge">best-mix.ppl</code>. It will contain a sequence of +values in parentheses. These are the interpolation weights of the source +language models in the order specified. Copy and paste the values within the +parentheses into the script below.</p> + +<h4 id="step-3-2-combine-the-models">Step 3-2: Combine the models</h4> + +<p>Initiate the following script from the <code class="highlighter-rouge">lm/</code> directory (edit it to specify the +path to the <code class="highlighter-rouge">ngram</code> binary as well as the interpolation weights):</p> + +<div class="highlighter-rouge"><pre class="highlight"><code><span class="c">#!/bin/bash</span> +<span class="nb">set</span> -x + +<span class="nv">NGRAM</span><span class="o">=</span><span class="nv">$SRILM_SRC</span>/bin/i686-m64/ngram +<span class="nv">DIRS</span><span class="o">=(</span> afp_eng apw_eng cna_eng ltw_eng nyt_eng wpb_eng xin_eng <span class="o">)</span> +<span class="nv">LAMBDAS</span><span class="o">=(</span>0.00631272 0.000647602 0.251555 0.0134726 0.348953 0.371566 0.00749238<span class="o">)</span> + +<span class="nv">$NGRAM</span> -order 5 -unk <span class="se">\</span> + -lm <span class="k">${</span><span class="nv">DIRS</span><span class="p">[0]</span><span class="k">}</span>/lm.gz -lambda <span class="k">${</span><span class="nv">LAMBDAS</span><span class="p">[0]</span><span class="k">}</span> <span class="se">\</span> + -mix-lm <span class="k">${</span><span class="nv">DIRS</span><span class="p">[1]</span><span class="k">}</span>/lm.gz <span class="se">\</span> + -mix-lm2 <span class="k">${</span><span class="nv">DIRS</span><span class="p">[2]</span><span class="k">}</span>/lm.gz -mix-lambda2 <span class="k">${</span><span class="nv">LAMBDAS</span><span class="p">[2]</span><span class="k">}</span> <span class="se">\</span> + -mix-lm3 <span 
class="k">${</span><span class="nv">DIRS</span><span class="p">[3]</span><span class="k">}</span>/lm.gz -mix-lambda3 <span class="k">${</span><span class="nv">LAMBDAS</span><span class="p">[3]</span><span class="k">}</span> <span class="se">\</span> + -mix-lm4 <span class="k">${</span><span class="nv">DIRS</span><span class="p">[4]</span><span class="k">}</span>/lm.gz -mix-lambda4 <span class="k">${</span><span class="nv">LAMBDAS</span><span class="p">[4]</span><span class="k">}</span> <span class="se">\</span> + -mix-lm5 <span class="k">${</span><span class="nv">DIRS</span><span class="p">[5]</span><span class="k">}</span>/lm.gz -mix-lambda5 <span class="k">${</span><span class="nv">LAMBDAS</span><span class="p">[5]</span><span class="k">}</span> <span class="se">\</span> + -mix-lm6 <span class="k">${</span><span class="nv">DIRS</span><span class="p">[6]</span><span class="k">}</span>/lm.gz -mix-lambda6 <span class="k">${</span><span class="nv">LAMBDAS</span><span class="p">[6]</span><span class="k">}</span> <span class="se">\</span> + -write-lm mixed_lm.gz +</code></pre> +</div> + +<p>The resulting file, <code class="highlighter-rouge">mixed_lm.gz</code>, is a language model based on all the text in +the Gigaword corpus and with some probabilities biased toward the development text +specified in step 3-1. It is in the ARPA format. The optional next step converts +it into KenLM format.</p> + +<h4 id="step-3-3-convert-to-kenlm">Step 3-3: Convert to KenLM</h4> + +<p>The KenLM format has some speed advantages over the ARPA format. 
Issuing the +following command will write a new language model file <code class="highlighter-rouge">mixed_lm-kenlm.gz</code> that +is the <code class="highlighter-rouge">mixed_lm.gz</code> language model transformed into the KenLM format.</p> + +<div class="highlighter-rouge"><pre class="highlight"><code>$JOSHUA/src/joshua/decoder/ff/lm/kenlm/build_binary mixed_lm.gz mixed_lm.kenlm +</code></pre> +</div> + + + + </div> + </div> + + </body> +</html> + + + + + http://git-wip-us.apache.org/repos/asf/incubator-joshua-site/blob/53cc3005/4.0/large-lms.md ---------------------------------------------------------------------- diff --git a/4.0/large-lms.md b/4.0/large-lms.md deleted file mode 100644 index a4ba5b7..0000000 --- a/4.0/large-lms.md +++ /dev/null @@ -1,192 +0,0 @@ ---- -layout: default4 -title: Building large LMs with SRILM -category: advanced ---- - -The following is a tutorial for building a large language model from the -English Gigaword Fifth Edition corpus -[LDC2011T07](http://www.ldc.upenn.edu/Catalog/catalogEntry.jsp?catalogId=LDC2011T07) -using SRILM. English text is provided from seven different sources. - -### Step 0: Clean up the corpus - -The Gigaword corpus has to be stripped of all SGML tags and tokenized. -Instructions for performing those steps are not included in this -documentation. A description of this process can be found in a paper -called ["Annotated -Gigaword"](https://akbcwekex2012.files.wordpress.com/2012/05/28_paper.pdf). - -The Joshua package ships with a script that converts all alphabetical -characters to their lowercase equivalent. The script is located at -`$JOSHUA/scripts/lowercase.perl`. - -Make a directory structure as follows: - - gigaword/ - âââ corpus/ - â  âââ afp_eng/ - â  â  âââ afp_eng_199405.lc.gz - â  â  âââ afp_eng_199406.lc.gz - â  â  âââ ... - â  â  âââ counts/ - â  âââ apw_eng/ - â  â  âââ apw_eng_199411.lc.gz - â  â  âââ apw_eng_199412.lc.gz - â  â  âââ ... - â  â  âââ counts/ - â  âââ cna_eng/ - â  â  âââ ... 
- â  â  âââ counts/ - â  âââ ltw_eng/ - â  â  âââ ... - â  â  âââ counts/ - â  âââ nyt_eng/ - â  â  âââ ... - â  â  âââ counts/ - â  âââ wpb_eng/ - â  â  âââ ... - â  â  âââ counts/ - â  âââ xin_eng/ - â    âââ ... - â    âââ counts/ - âââ lm/ -   âââ afp_eng/ -   âââ apw_eng/ -   âââ cna_eng/ -   âââ ltw_eng/ -   âââ nyt_eng/ -   âââ wpb_eng/ -   âââ xin_eng/ - - -The next step will be to build smaller LMs and then interpolate them into one -file. - -### Step 1: Count ngrams - -Run the following script once from each source directory under the `corpus/` -directory (edit it to specify the path to the `ngram-count` binary as well as -the number of processors): - - #!/bin/sh - - NGRAM_COUNT=$SRILM_SRC/bin/i686-m64/ngram-count - args="" - - for source in *.gz; do - args=$args"-sort -order 5 -text $source -write counts/$source-counts.gz " - done - - echo $args | xargs --max-procs=4 -n 7 $NGRAM_COUNT - -Then move each `counts/` directory to the corresponding directory under -`lm/`. Now that each ngram has been counted, we can make a language -model for each of the seven sources. - -### Step 2: Make individual language models - -SRILM includes a script, called `make-big-lm`, for building large language -models under resource-limited environments. The manual for this script can be -read online -[here](http://www-speech.sri.com/projects/srilm/manpages/training-scripts.1.html). -Since the Gigaword corpus is so large, it is convenient to use `make-big-lm` -even in environments with many parallel processors and a lot of memory. 
- -Initiate the following script from each of the source directories under the -`lm/` directory (edit it to specify the path to the `make-big-lm` script as -well as the pruning threshold): - - #!/bin/bash - set -x - - CMD=$SRILM_SRC/bin/make-big-lm - PRUNE_THRESHOLD=1e-8 - - $CMD \ - -name gigalm `for k in counts/*.gz; do echo " \ - -read $k "; done` \ - -lm lm.gz \ - -max-per-file 100000000 \ - -order 5 \ - -kndiscount \ - -interpolate \ - -unk \ - -prune $PRUNE_THRESHOLD - -The language model attributes chosen are the following: - -* N-grams up to order 5 -* Kneser-Ney smoothing -* N-gram probability estimates at the specified order *n* are interpolated with - lower-order estimates -* include the unknown-word token as a regular word -* pruning N-grams based on the specified threshold - -Next, we will mix the models together into a single file. - -### Step 3: Mix models together - -Using development text, interpolation weights can be determined that give the highest -weight to the source language models that have the lowest perplexity on the -specified development set. - -#### Step 3-1: Determine interpolation weights - -Initiate the following script from the `lm/` directory (edit it to specify the -path to the `ngram` binary as well as the path to the development text file): - - #!/bin/bash - set -x - - NGRAM=$SRILM_SRC/bin/i686-m64/ngram - DEV_TEXT=~mpost/expts/wmt12/runs/es-en/data/tune/tune.tok.lc.es - - dirs=( afp_eng apw_eng cna_eng ltw_eng nyt_eng wpb_eng xin_eng ) - - for d in ${dirs[@]} ; do - $NGRAM -debug 2 -order 5 -unk -lm $d/lm.gz -ppl $DEV_TEXT > $d/lm.ppl ; - done - - compute-best-mix */lm.ppl > best-mix.ppl - -Take a look at the contents of `best-mix.ppl`. It will contain a sequence of -values in parentheses. These are the interpolation weights of the source -language models in the order specified. Copy and paste the values within the -parentheses into the script below. 
- -#### Step 3-2: Combine the models - -Initiate the following script from the `lm/` directory (edit it to specify the -path to the `ngram` binary as well as the interpolation weights): - - #!/bin/bash - set -x - - NGRAM=$SRILM_SRC/bin/i686-m64/ngram - DIRS=( afp_eng apw_eng cna_eng ltw_eng nyt_eng wpb_eng xin_eng ) - LAMBDAS=(0.00631272 0.000647602 0.251555 0.0134726 0.348953 0.371566 0.00749238) - - $NGRAM -order 5 -unk \ - -lm ${DIRS[0]}/lm.gz -lambda ${LAMBDAS[0]} \ - -mix-lm ${DIRS[1]}/lm.gz \ - -mix-lm2 ${DIRS[2]}/lm.gz -mix-lambda2 ${LAMBDAS[2]} \ - -mix-lm3 ${DIRS[3]}/lm.gz -mix-lambda3 ${LAMBDAS[3]} \ - -mix-lm4 ${DIRS[4]}/lm.gz -mix-lambda4 ${LAMBDAS[4]} \ - -mix-lm5 ${DIRS[5]}/lm.gz -mix-lambda5 ${LAMBDAS[5]} \ - -mix-lm6 ${DIRS[6]}/lm.gz -mix-lambda6 ${LAMBDAS[6]} \ - -write-lm mixed_lm.gz - -The resulting file, `mixed_lm.gz`, is a language model based on all the text in -the Gigaword corpus and with some probabilities biased toward the development text -specified in step 3-1. It is in the ARPA format. The optional next step converts -it into KenLM format. - -#### Step 3-3: Convert to KenLM - -The KenLM format has some speed advantages over the ARPA format. Issuing the -following command will write a new language model file `mixed_lm.kenlm` that -is the `mixed_lm.gz` language model transformed into the KenLM format. 
- - $JOSHUA/src/joshua/decoder/ff/lm/kenlm/build_binary mixed_lm.gz mixed_lm.kenlm - http://git-wip-us.apache.org/repos/asf/incubator-joshua-site/blob/53cc3005/4.0/lattice.html ---------------------------------------------------------------------- diff --git a/4.0/lattice.html b/4.0/lattice.html new file mode 100644 index 0000000..74c45fa --- /dev/null +++ b/4.0/lattice.html @@ -0,0 +1,267 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> + +<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> + <head> + <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" /> + <link rel="stylesheet" type="text/css" media="screen,print" href="../joshua4.css" /> + <title>Joshua | Lattice decoding</title> + </head> + + <body> + + <div id="navbar"> + <a href="http://joshua-decoder.org/"> + <img src="../images/joshua-logo-small.png" width="130px" + alt="Joshua logo (picture of a Joshua tree)" /> + </a> + + <p class="infobox"> + <b>Stable version</b><br /> + 4.1<br/><br/> + <b>Release date</b><br /> + 2013 January + </p> + +<!-- <div class="infobox"> --> +<!-- <b>AUTO LINKS</b><br/> --> +<!-- <ul> --> +<!-- --> +<!-- <li> Advanced features</li> --> +<!-- --> +<!-- <li> Advanced features</li> --> +<!-- --> +<!-- <li> Advanced features</li> --> +<!-- --> +<!-- <li> Building a language pack</li> --> +<!-- --> +<!-- <li> Building a language pack</li> --> +<!-- --> +<!-- <li> Bundling a configuration</li> --> +<!-- --> +<!-- <li> Contributors</li> --> +<!-- --> +<!-- <li> Decoder configuration parameters</li> --> +<!-- --> +<!-- <li> Decoder configuration parameters</li> --> +<!-- --> +<!-- <li> Decoder configuration parameters</li> --> +<!-- --> +<!-- <li> Decoder configuration parameters</li> --> +<!-- --> +<!-- <li> Frequently Asked Questions</li> --> +<!-- --> +<!-- <li> Common problems</li> --> +<!-- --> +<!-- <li> Frequently Asked Questions</li> --> +<!-- 
--> +<!-- <li> Common problems</li> --> +<!-- --> +<!-- <li> Features</li> --> +<!-- --> +<!-- <li> Features</li> --> +<!-- --> +<!-- <li> Features</li> --> +<!-- --> +<!-- <li> Features</li> --> +<!-- --> +<!-- <li> Joshua file formats</li> --> +<!-- --> +<!-- <li> Joshua file formats</li> --> +<!-- --> +<!-- <li> Joshua file formats</li> --> +<!-- --> +<!-- <li> Joshua file formats</li> --> +<!-- --> +<!-- <li> </li> --> +<!-- --> +<!-- <li> </li> --> +<!-- --> +<!-- <li> </li> --> +<!-- --> +<!-- <li> Fisher and CALLHOME Spanish English Speech Translation Corpus</li> --> +<!-- --> +<!-- <li> Indian Languages Parallel Corpora</li> --> +<!-- --> +<!-- <li> Joshua 4.0 User Documentation</li> --> +<!-- --> +<!-- <li> Language packs</li> --> +<!-- --> +<!-- <li> Paraphrase Packs</li> --> +<!-- --> +<!-- <li> Joshua releases</li> --> +<!-- --> +<!-- <li> Support</li> --> +<!-- --> +<!-- <li> Getting Started</li> --> +<!-- --> +<!-- <li> Welcome to Joshua</li> --> +<!-- --> +<!-- <li> Joshua documentation</li> --> +<!-- --> +<!-- <li> Joshua documentation</li> --> +<!-- --> +<!-- <li> Installation</li> --> +<!-- --> +<!-- <li> Installation</li> --> +<!-- --> +<!-- <li> Alignment with Jacana</li> --> +<!-- --> +<!-- <li> Alignment with Jacana</li> --> +<!-- --> +<!-- <li> Alignment with Jacana</li> --> +<!-- --> +<!-- <li> Building large LMs with SRILM</li> --> +<!-- --> +<!-- <li> Building large LMs with SRILM</li> --> +<!-- --> +<!-- <li> Building large LMs with SRILM</li> --> +<!-- --> +<!-- <li> Building large LMs with SRILM</li> --> +<!-- --> +<!-- <li> Lattice decoding</li> --> +<!-- --> +<!-- <li> Grammar Packing</li> --> +<!-- --> +<!-- <li> Grammar Packing</li> --> +<!-- --> +<!-- <li> Grammar Packing</li> --> +<!-- --> +<!-- <li> Grammar Packing</li> --> +<!-- --> +<!-- <li> The Joshua Pipeline</li> --> +<!-- --> +<!-- <li> The Joshua Pipeline</li> --> +<!-- --> +<!-- <li> The Joshua Pipeline</li> --> +<!-- --> +<!-- <li> The Joshua Pipeline</li> --> +<!-- --> 
+<!-- <li> Quick Start</li> --> +<!-- --> +<!-- <li> Quick Start</li> --> +<!-- --> +<!-- <li> Releases</li> --> +<!-- --> +<!-- <li> Server mode</li> --> +<!-- --> +<!-- <li> Server mode</li> --> +<!-- --> +<!-- <li> Server mode</li> --> +<!-- --> +<!-- <li> Installing and running the Joshua Decoder</li> --> +<!-- --> +<!-- <li> Grammar extraction with Thrax</li> --> +<!-- --> +<!-- <li> Grammar extraction with Thrax</li> --> +<!-- --> +<!-- <li> Grammar extraction with Thrax</li> --> +<!-- --> +<!-- <li> Grammar extraction with Thrax</li> --> +<!-- --> +<!-- <li> Building Translation Models</li> --> +<!-- --> +<!-- <li> Building Translation Models</li> --> +<!-- --> +<!-- <li> Building Translation Models</li> --> +<!-- --> +<!-- <li> Building Translation Models</li> --> +<!-- --> +<!-- <li> Pipeline tutorial</li> --> +<!-- --> +<!-- <li> Pipeline tutorial</li> --> +<!-- --> +<!-- <li> Pipeline tutorial</li> --> +<!-- --> +<!-- <li> What's New</li> --> +<!-- --> +<!-- <li> What's New</li> --> +<!-- --> +<!-- <li> Z-MERT</li> --> +<!-- --> +<!-- <li> Z-MERT</li> --> +<!-- --> +<!-- <li> Z-MERT</li> --> +<!-- --> +<!-- <li> Z-MERT</li> --> +<!-- --> +<!-- <li> </li> --> +<!-- --> +<!-- <li> </li> --> +<!-- --> +<!-- <li> </li> --> +<!-- --> +<!-- </ul> --> +<!-- </div> --> + + <div class="infobox"> + + <b>Links</b><br /> + <ul> + <li> <a href="../index.html">Main</a> </li> + <li> <a href="pipeline.html">Pipeline</a> </li> + <li> <a href="step-by-step-instructions.html">Manual walkthrough</a> </li> + <li> <a href="decoder.html">Decoder</a> </li> + <li> <a href="server.html">Decoder Server</a> </li> + <li> <a href="file-formats.html">File formats</a> </li> + <li> <a href="thrax.html">Grammar Extraction</a> </li> + <li> <a href="../releases.html">Releases</a> </li> + </ul> + </div> + + <div class="infobox"> + <b>Advanced</b><br /> + <ul> +<!-- <li> <a href="packing.html">Grammar packing</a> </li> --> + <li> <a href="large-lms.html">Building large LMs</a> </li> + <li> 
<a href="zmert.html">Running Z-MERT</a> </li> + <li> <a href="lattice.html">Lattices</a> </li> + <li> <a href="server.html">TCP/IP server</a> </li> + <li> <a href="bundle.html">Bundled configuration</a> </li> + </ul> + </div> + + <div class="infobox"> + <b>Help</b><br /> + <ul> + <li> <a href="faq.html">Answers</a> </li> + <li> <a href="https://groups.google.com/d/forum/joshua_support">Archive</a> </li> + </ul> + </div> + + <div class="footer"> + Last updated on April 08, 2016 + </div> + + </div> + + <div id="main"> + <div id="title"> + <h1>Lattice decoding</h1> + </div> + + <div id="content"> + + <p>In addition to regular sentences, Joshua can decode weighted lattices encoded in <a href="http://www.statmt.org/moses/?n=Moses.WordLattices">the PLF +format</a>. Lattice decoding was originally added +by Lane Schwartz and <a href="http://www.cs.cmu.edu/~cdyer/">Chris Dyer</a>.</p> + +<p>Joshua will automatically detect whether the input sentence is a regular sentence +(the usual case) or a lattice. If a lattice, a feature will be activated that accumulates the cost +of different paths through the lattice. In this case, you need to ensure that a weight for this +feature is present in <a href="decoder.html">your model file</a>.</p> + +<p>The main caveat with Joshua's PLF lattice support is that the lattice needs to be listed on a +single line.</p> + + + </div> + </div> + + </body> +</html> + + + + + http://git-wip-us.apache.org/repos/asf/incubator-joshua-site/blob/53cc3005/4.0/lattice.md ---------------------------------------------------------------------- diff --git a/4.0/lattice.md b/4.0/lattice.md deleted file mode 100644 index 5d6bd47..0000000 --- a/4.0/lattice.md +++ /dev/null @@ -1,17 +0,0 @@ ---- -layout: default4 -category: advanced -title: Lattice decoding ---- - -In addition to regular sentences, Joshua can decode weighted lattices encoded in [the PLF -format](http://www.statmt.org/moses/?n=Moses.WordLattices). 
Lattice decoding was originally added -by Lane Schwartz and [Chris Dyer](http://www.cs.cmu.edu/~cdyer/). - -Joshua will automatically detect whether the input sentence is a regular sentence -(the usual case) or a lattice. If a lattice, a feature will be activated that accumulates the cost -of different paths through the lattice. In this case, you need to ensure that a weight for this -feature is present in [your model file](decoder.html). - -The main caveats with Joshua's PLF lattice support are that the lattice needs to be listed on a -single line. http://git-wip-us.apache.org/repos/asf/incubator-joshua-site/blob/53cc3005/4.0/packing.html ---------------------------------------------------------------------- diff --git a/4.0/packing.html b/4.0/packing.html new file mode 100644 index 0000000..cdfa675 --- /dev/null +++ b/4.0/packing.html @@ -0,0 +1,357 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> + +<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> + <head> + <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" /> + <link rel="stylesheet" type="text/css" media="screen,print" href="../joshua4.css" /> + <title>Joshua | Grammar Packing</title> + </head> + + <body> + + <div id="navbar"> + <a href="http://joshua-decoder.org/"> + <img src="../images/joshua-logo-small.png" width="130px" + alt="Joshua logo (picture of a Joshua tree)" /> + </a> + + <p class="infobox"> + <b>Stable version</b><br /> + 4.1<br/><br/> + <b>Release date</b><br /> + 2013 January + </p> + +<!-- <div class="infobox"> --> +<!-- <b>AUTO LINKS</b><br/> --> +<!-- <ul> --> +<!-- --> +<!-- <li> Advanced features</li> --> +<!-- --> +<!-- <li> Advanced features</li> --> +<!-- --> +<!-- <li> Advanced features</li> --> +<!-- --> +<!-- <li> Building a language pack</li> --> +<!-- --> +<!-- <li> Building a language pack</li> --> +<!-- --> +<!-- <li> Bundling a 
configuration</li> --> +<!-- --> +<!-- <li> Contributors</li> --> +<!-- --> +<!-- <li> Decoder configuration parameters</li> --> +<!-- --> +<!-- <li> Decoder configuration parameters</li> --> +<!-- --> +<!-- <li> Decoder configuration parameters</li> --> +<!-- --> +<!-- <li> Decoder configuration parameters</li> --> +<!-- --> +<!-- <li> Frequently Asked Questions</li> --> +<!-- --> +<!-- <li> Common problems</li> --> +<!-- --> +<!-- <li> Frequently Asked Questions</li> --> +<!-- --> +<!-- <li> Common problems</li> --> +<!-- --> +<!-- <li> Features</li> --> +<!-- --> +<!-- <li> Features</li> --> +<!-- --> +<!-- <li> Features</li> --> +<!-- --> +<!-- <li> Features</li> --> +<!-- --> +<!-- <li> Joshua file formats</li> --> +<!-- --> +<!-- <li> Joshua file formats</li> --> +<!-- --> +<!-- <li> Joshua file formats</li> --> +<!-- --> +<!-- <li> Joshua file formats</li> --> +<!-- --> +<!-- <li> </li> --> +<!-- --> +<!-- <li> </li> --> +<!-- --> +<!-- <li> </li> --> +<!-- --> +<!-- <li> Fisher and CALLHOME Spanish English Speech Translation Corpus</li> --> +<!-- --> +<!-- <li> Indian Languages Parallel Corpora</li> --> +<!-- --> +<!-- <li> Joshua 4.0 User Documentation</li> --> +<!-- --> +<!-- <li> Language packs</li> --> +<!-- --> +<!-- <li> Paraphrase Packs</li> --> +<!-- --> +<!-- <li> Joshua releases</li> --> +<!-- --> +<!-- <li> Support</li> --> +<!-- --> +<!-- <li> Getting Started</li> --> +<!-- --> +<!-- <li> Welcome to Joshua</li> --> +<!-- --> +<!-- <li> Joshua documentation</li> --> +<!-- --> +<!-- <li> Joshua documentation</li> --> +<!-- --> +<!-- <li> Installation</li> --> +<!-- --> +<!-- <li> Installation</li> --> +<!-- --> +<!-- <li> Alignment with Jacana</li> --> +<!-- --> +<!-- <li> Alignment with Jacana</li> --> +<!-- --> +<!-- <li> Alignment with Jacana</li> --> +<!-- --> +<!-- <li> Building large LMs with SRILM</li> --> +<!-- --> +<!-- <li> Building large LMs with SRILM</li> --> +<!-- --> +<!-- <li> Building large LMs with SRILM</li> --> +<!-- --> +<!-- 
<li> Building large LMs with SRILM</li> --> +<!-- --> +<!-- <li> Lattice decoding</li> --> +<!-- --> +<!-- <li> Grammar Packing</li> --> +<!-- --> +<!-- <li> Grammar Packing</li> --> +<!-- --> +<!-- <li> Grammar Packing</li> --> +<!-- --> +<!-- <li> Grammar Packing</li> --> +<!-- --> +<!-- <li> The Joshua Pipeline</li> --> +<!-- --> +<!-- <li> The Joshua Pipeline</li> --> +<!-- --> +<!-- <li> The Joshua Pipeline</li> --> +<!-- --> +<!-- <li> The Joshua Pipeline</li> --> +<!-- --> +<!-- <li> Quick Start</li> --> +<!-- --> +<!-- <li> Quick Start</li> --> +<!-- --> +<!-- <li> Releases</li> --> +<!-- --> +<!-- <li> Server mode</li> --> +<!-- --> +<!-- <li> Server mode</li> --> +<!-- --> +<!-- <li> Server mode</li> --> +<!-- --> +<!-- <li> Installing and running the Joshua Decoder</li> --> +<!-- --> +<!-- <li> Grammar extraction with Thrax</li> --> +<!-- --> +<!-- <li> Grammar extraction with Thrax</li> --> +<!-- --> +<!-- <li> Grammar extraction with Thrax</li> --> +<!-- --> +<!-- <li> Grammar extraction with Thrax</li> --> +<!-- --> +<!-- <li> Building Translation Models</li> --> +<!-- --> +<!-- <li> Building Translation Models</li> --> +<!-- --> +<!-- <li> Building Translation Models</li> --> +<!-- --> +<!-- <li> Building Translation Models</li> --> +<!-- --> +<!-- <li> Pipeline tutorial</li> --> +<!-- --> +<!-- <li> Pipeline tutorial</li> --> +<!-- --> +<!-- <li> Pipeline tutorial</li> --> +<!-- --> +<!-- <li> What's New</li> --> +<!-- --> +<!-- <li> What's New</li> --> +<!-- --> +<!-- <li> Z-MERT</li> --> +<!-- --> +<!-- <li> Z-MERT</li> --> +<!-- --> +<!-- <li> Z-MERT</li> --> +<!-- --> +<!-- <li> Z-MERT</li> --> +<!-- --> +<!-- <li> </li> --> +<!-- --> +<!-- <li> </li> --> +<!-- --> +<!-- <li> </li> --> +<!-- --> +<!-- </ul> --> +<!-- </div> --> + + <div class="infobox"> + + <b>Links</b><br /> + <ul> + <li> <a href="../index.html">Main</a> </li> + <li> <a href="pipeline.html">Pipeline</a> </li> + <li> <a href="step-by-step-instructions.html">Manual 
walkthrough</a> </li> + <li> <a href="decoder.html">Decoder</a> </li> + <li> <a href="server.html">Decoder Server</a> </li> + <li> <a href="file-formats.html">File formats</a> </li> + <li> <a href="thrax.html">Grammar Extraction</a> </li> + <li> <a href="../releases.html">Releases</a> </li> + </ul> + </div> + + <div class="infobox"> + <b>Advanced</b><br /> + <ul> +<!-- <li> <a href="packing.html">Grammar packing</a> </li> --> + <li> <a href="large-lms.html">Building large LMs</a> </li> + <li> <a href="zmert.html">Running Z-MERT</a> </li> + <li> <a href="lattice.html">Lattices</a> </li> + <li> <a href="server.html">TCP/IP server</a> </li> + <li> <a href="bundle.html">Bundled configuration</a> </li> + </ul> + </div> + + <div class="infobox"> + <b>Help</b><br /> + <ul> + <li> <a href="faq.html">Answers</a> </li> + <li> <a href="https://groups.google.com/d/forum/joshua_support">Archive</a> </li> + </ul> + </div> + + <div class="footer"> + Last updated on April 08, 2016 + </div> + + </div> + + <div id="main"> + <div id="title"> + <h1>Grammar Packing</h1> + </div> + + <div id="content"> + + <p>Grammar packing refers to the process of taking a textual grammar output by <a href="thrax.html">Thrax</a> and +efficiently encoding it for use by Joshua. Packing the grammar results in significantly faster load +times for very large grammars.</p> + +<p>Soon, the <a href="pipeline.html">Joshua pipeline script</a> will add support for grammar packing +automatically, and we will provide a script that automates these steps for you.</p> + +<ol> + <li> + <p>Make sure the grammar is labeled. A labeled grammar is one that has feature names attached to +each of the feature values in each row of the grammar file. 
Here is a line from an unlabeled +grammar:</p> + + <div class="highlighter-rouge"><pre class="highlight"><code> [X] ||| [X,1] অন্যান্য [X,2] ||| [X,1] other [X,2] ||| 0 0 1 0 0 1.02184 +</code></pre> + </div> + + <p>and here is one from a labeled grammar (note that the labels are not very useful):</p> + + <div class="highlighter-rouge"><pre class="highlight"><code> [X] ||| [X,1] অন্যান্য [X,2] ||| [X,1] other [X,2] ||| f1=0 f2=0 f3=1 f4=0 f5=0 f6=1.02184 +</code></pre> + </div> + + <p>If your grammar is not labeled, you can use the script <code class="highlighter-rouge">$JOSHUA/scripts/label_grammar.py</code>:</p> + + <div class="highlighter-rouge"><pre class="highlight"><code> zcat grammar.gz | $JOSHUA/scripts/label_grammar.py > grammar-labeled.gz +</code></pre> + </div> + + <p>A side effect of this step is to produce a file “dense_map” in the current directory, +containing the mapping between feature names and feature columns. This file is needed in later +steps.</p> + </li> + <li> + <p>The packer needs a sorted grammar. It is sufficient to sort by the first word:</p> + + <div class="highlighter-rouge"><pre class="highlight"><code> zcat grammar-labeled.gz | sort -k3,3 | gzip > grammar-sorted.gz +</code></pre> + </div> + + <p>(The reason we need a sorted grammar is because the packer stores the grammar in a trie. The +pieces can’t be more than 2 GB due to Java limitations, so we need to ensure that rules are +grouped by the first arc in the trie to avoid redundancy across tries and to simplify the +lookup).</p> + </li> + <li> + <p>In order to pack the grammar, we need two pieces of information: (1) a packer configuration file, +and (2) a dense map file.</p> + + <ol> + <li> + <p>Write a packer config file. This file specifies items such as the chunk size (for the packed +pieces) and the quantization classes and types for each feature name. 
Examples can be found +at</p> + + <div class="highlighter-rouge"><pre class="highlight"><code> $JOSHUA/test/packed/packer.config + $JOSHUA/test/bn-en/packed/packer.quantized + $JOSHUA/test/bn-en/packed/packer.uncompressed +</code></pre> + </div> + + <p>The quantizer lines in the packer config file have the following format:</p> + + <div class="highlighter-rouge"><pre class="highlight"><code> quantizer TYPE FEATURES +</code></pre> + </div> + + <p>where <code class="highlighter-rouge">TYPE</code> is one of <code class="highlighter-rouge">boolean</code>, <code class="highlighter-rouge">float</code>, <code class="highlighter-rouge">byte</code>, or <code class="highlighter-rouge">8bit</code>, and <code class="highlighter-rouge">FEATURES</code> is a + space-delimited list of feature names that have that quantization type.</p> + </li> + <li> + <p>Write a dense_map file. If you labeled an unlabeled grammar, this was produced for you as a +side product of the <code class="highlighter-rouge">label_grammar.py</code> script you called in Step 1. 
Otherwise, you need to +create a file that lists the mapping between feature names and (0-indexed) columns in the +grammar, one per line, in the following format:</p> + + <div class="highlighter-rouge"><pre class="highlight"><code> feature-index feature-name +</code></pre> + </div> + </li> + </ol> + </li> + <li> + <p>To pack the grammar, type the following command:</p> + + <div class="highlighter-rouge"><pre class="highlight"><code> java -cp $JOSHUA/bin joshua.tools.GrammarPacker -c PACKER_CONFIG_FILE -p OUTPUT_DIR -g GRAMMAR_FILE +</code></pre> + </div> + + <p>This will read in your packer configuration file and your grammar, and produce a packed grammar + in the output directory.</p> + </li> + <li> + <p>To use the packed grammar, just point to the packed directory in your Joshua configuration file.</p> + + <div class="highlighter-rouge"><pre class="highlight"><code> tm-file = packed-grammar/ + tm-format = packed +</code></pre> + </div> + </li> +</ol> + + + </div> + </div> + + </body> +</html> + + + + +
