http://git-wip-us.apache.org/repos/asf/madlib-site/blob/af0e5f14/docs/v1.15.1/group__grp__crf.html ---------------------------------------------------------------------- diff --git a/docs/v1.15.1/group__grp__crf.html b/docs/v1.15.1/group__grp__crf.html new file mode 100644 index 0000000..d0c215d --- /dev/null +++ b/docs/v1.15.1/group__grp__crf.html @@ -0,0 +1,632 @@ +<!-- HTML header for doxygen 1.8.4--> +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> +<html xmlns="http://www.w3.org/1999/xhtml"> +<head> +<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/> +<meta http-equiv="X-UA-Compatible" content="IE=9"/> +<meta name="generator" content="Doxygen 1.8.14"/> +<meta name="keywords" content="madlib,postgres,greenplum,machine learning,data mining,deep learning,ensemble methods,data science,market basket analysis,affinity analysis,pca,lda,regression,elastic net,huber white,proportional hazards,k-means,latent dirichlet allocation,bayes,support vector machines,svm"/> +<title>MADlib: Conditional Random Field</title> +<link href="tabs.css" rel="stylesheet" type="text/css"/> +<script type="text/javascript" src="jquery.js"></script> +<script type="text/javascript" src="dynsections.js"></script> +<link href="navtree.css" rel="stylesheet" type="text/css"/> +<script type="text/javascript" src="resize.js"></script> +<script type="text/javascript" src="navtreedata.js"></script> +<script type="text/javascript" src="navtree.js"></script> +<script type="text/javascript"> +/* @license magnet:?xt=urn:btih:cf05388f2679ee054f2beb29a391d25f4e673ac3&dn=gpl-2.0.txt GPL-v2 */ + $(document).ready(initResizable); +/* @license-end */</script> +<link href="search/search.css" rel="stylesheet" type="text/css"/> +<script type="text/javascript" src="search/searchdata.js"></script> +<script type="text/javascript" src="search/search.js"></script> +<script type="text/javascript"> +/* @license 
magnet:?xt=urn:btih:cf05388f2679ee054f2beb29a391d25f4e673ac3&dn=gpl-2.0.txt GPL-v2 */ + $(document).ready(function() { init_search(); }); +/* @license-end */ +</script> +<script type="text/x-mathjax-config"> + MathJax.Hub.Config({ + extensions: ["tex2jax.js", "TeX/AMSmath.js", "TeX/AMSsymbols.js"], + jax: ["input/TeX","output/HTML-CSS"], +}); +</script><script type="text/javascript" async src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.2/MathJax.js"></script> +<!-- hack in the navigation tree --> +<script type="text/javascript" src="eigen_navtree_hacks.js"></script> +<link href="doxygen.css" rel="stylesheet" type="text/css" /> +<link href="madlib_extra.css" rel="stylesheet" type="text/css"/> +<!-- google analytics --> +<script> + (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){ + (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o), + m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m) + })(window,document,'script','//www.google-analytics.com/analytics.js','ga'); + ga('create', 'UA-45382226-1', 'madlib.apache.org'); + ga('send', 'pageview'); +</script> +</head> +<body> +<div id="top"><!-- do not remove this div, it is closed by doxygen! 
--> +<div id="titlearea"> +<table cellspacing="0" cellpadding="0"> + <tbody> + <tr style="height: 56px;"> + <td id="projectlogo"><a href="http://madlib.apache.org"><img alt="Logo" src="madlib.png" height="50" style="padding-left:0.5em;" border="0"/ ></a></td> + <td style="padding-left: 0.5em;"> + <div id="projectname"> + <span id="projectnumber">1.15.1</span> + </div> + <div id="projectbrief">User Documentation for Apache MADlib</div> + </td> + <td> <div id="MSearchBox" class="MSearchBoxInactive"> + <span class="left"> + <img id="MSearchSelect" src="search/mag_sel.png" + onmouseover="return searchBox.OnSearchSelectShow()" + onmouseout="return searchBox.OnSearchSelectHide()" + alt=""/> + <input type="text" id="MSearchField" value="Search" accesskey="S" + onfocus="searchBox.OnSearchFieldFocus(true)" + onblur="searchBox.OnSearchFieldFocus(false)" + onkeyup="searchBox.OnSearchFieldChange(event)"/> + </span><span class="right"> + <a id="MSearchClose" href="javascript:searchBox.CloseResultsWindow()"><img id="MSearchCloseImg" border="0" src="search/close.png" alt=""/></a> + </span> + </div> +</td> + </tr> + </tbody> +</table> +</div> +<!-- end header part --> +<!-- Generated by Doxygen 1.8.14 --> +<script type="text/javascript"> +/* @license magnet:?xt=urn:btih:cf05388f2679ee054f2beb29a391d25f4e673ac3&dn=gpl-2.0.txt GPL-v2 */ +var searchBox = new SearchBox("searchBox", "search",false,'Search'); +/* @license-end */ +</script> +</div><!-- top --> +<div id="side-nav" class="ui-resizable side-nav-resizable"> + <div id="nav-tree"> + <div id="nav-tree-contents"> + <div id="nav-sync" class="sync"></div> + </div> + </div> + <div id="splitbar" style="-moz-user-select:none;" + class="ui-resizable-handle"> + </div> +</div> +<script type="text/javascript"> +/* @license magnet:?xt=urn:btih:cf05388f2679ee054f2beb29a391d25f4e673ac3&dn=gpl-2.0.txt GPL-v2 */ +$(document).ready(function(){initNavTree('group__grp__crf.html','');}); +/* @license-end */ +</script> +<div id="doc-content"> 
+<!-- window showing the filter options --> +<div id="MSearchSelectWindow" + onmouseover="return searchBox.OnSearchSelectShow()" + onmouseout="return searchBox.OnSearchSelectHide()" + onkeydown="return searchBox.OnSearchSelectKey(event)"> +</div> + +<!-- iframe showing the search results (closed by default) --> +<div id="MSearchResultsWindow"> +<iframe src="javascript:void(0)" frameborder="0" + name="MSearchResults" id="MSearchResults"> +</iframe> +</div> + +<div class="header"> + <div class="headertitle"> +<div class="title">Conditional Random Field<div class="ingroups"><a class="el" href="group__grp__super.html">Supervised Learning</a></div></div> </div> +</div><!--header--> +<div class="contents"> +<div class="toc"><b>Contents</b> <ul> +<li> +<a href="#train_feature">Training Feature Generation</a> </li> +<li> +<a href="#train">CRF Training Function</a> </li> +<li> +<a href="#test_feature">Testing Feature Generation</a> </li> +<li> +<a href="#inference">Inference using Viterbi</a> </li> +<li> +<a href="#usage">Using CRF</a> </li> +<li> +<a href="#examples">Examples</a> </li> +<li> +<a href="#background">Technical Background</a> </li> +<li> +<a href="#literature">Literature</a> </li> +<li> +<a href="#related">Related Topics</a> </li> +</ul> +</div><p>A conditional random field (CRF) is a type of discriminative, undirected probabilistic graphical model. A linear-chain CRF is a special type of CRF that assumes the current state depends only on the previous state.</p> +<p>Feature extraction modules are provided for text-analysis tasks such as part-of-speech (POS) tagging and named-entity recognition (NER). 
Currently, six feature types are implemented:</p> +<ul> +<li>Edge Feature: transition feature that encodes the transition feature weight from current label to next label.</li> +<li>Start Feature: fired when the current token is the first token in a sequence.</li> +<li>End Feature: fired when the current token is the last token in a sequence.</li> +<li>Word Feature: fired when the current token is observed in the trained dictionary.</li> +<li>Unknown Feature: fired when the current token is not observed in the trained dictionary for at least a certain number of times (default 1).</li> +<li>Regex Feature: fired when the current token can be matched by a regular expression.</li> +</ul> +<p>A Viterbi implementation is also provided to get the best label sequence and the conditional probability \( \Pr( \text{best label sequence} \mid \text{sequence}) \).</p> +<p>The following steps are required for CRF learning and inference:</p><ol type="1"> +<li><a href="#train_feature">Training Feature Generation</a></li> +<li><a href="#train">CRF Training</a></li> +<li><a href="#test_feature">Testing Feature Generation</a></li> +<li><a href="#inference">Inference using Viterbi</a></li> +</ol> +<p><a class="anchor" id="train_feature"></a></p><dl class="section user"><dt>Training Feature Generation</dt><dd>The function takes <code>train_segment_tbl</code>, <code>regex_tbl</code> and <code>label_tbl</code> as input and performs feature generation, producing three tables, <code>dictionary_tbl</code>, <code>train_feature_tbl</code> and <code>train_featureset_tbl</code>, that are required as input for CRF training. <pre class="syntax"> +crf_train_fgen(train_segment_tbl, + regex_tbl, + label_tbl, + dictionary_tbl, + train_feature_tbl, + train_featureset_tbl) +</pre> <b>Arguments</b> <dl class="arglist"> +<dt>train_segment_tbl </dt> +<dd>TEXT. Name of the training segment table. The table is expected to have the following columns: <table class="output"> +<tr> +<th>doc_id </th><td>INTEGER. 
Document id column </td></tr> +<tr> +<th>start_pos </th><td>INTEGER. Index of a particular term in the respective document </td></tr> +<tr> +<th>seg_text </th><td>TEXT. Term at the respective <code>start_pos</code> in the document </td></tr> +<tr> +<th>label </th><td>INTEGER. Label id for the term corresponding to the actual label from <code>label_tbl</code> </td></tr> +</table> +</dd> +<dt>regex_tbl </dt> +<dd>TEXT. Name of the regular expression table. The table is expected to have the following columns: <table class="output"> +<tr> +<th>pattern </th><td>TEXT. Regular Expression </td></tr> +<tr> +<th>name </th><td>TEXT. Regular Expression name </td></tr> +</table> +</dd> +<dt>label_tbl </dt> +<dd>TEXT. Name of the table containing unique labels and their ids. The table is expected to have the following columns: <table class="output"> +<tr> +<th>id </th><td>INTEGER. Unique label id. NOTE: Must range from 0 to (total number of labels in the table - 1). </td></tr> +<tr> +<th>label </th><td>TEXT. Label name </td></tr> +</table> +</dd> +<dt>dictionary_tbl </dt> +<dd>TEXT. Name of the dictionary table to be created containing unique terms along with their counts. The table will have the following columns: <table class="output"> +<tr> +<th>token </th><td>TEXT. Contains all the unique terms found in <code>train_segment_tbl</code> </td></tr> +<tr> +<th>total </th><td>INTEGER. Respective counts for the terms </td></tr> +</table> +</dd> +<dt>train_feature_tbl </dt> +<dd><p class="startdd">TEXT. Name of the training feature table to be created. The table will have the following columns: </p><table class="output"> +<tr> +<th>doc_id </th><td>INTEGER. Document id </td></tr> +<tr> +<th>f_size </th><td>INTEGER. Feature set size. This value will be the same for all the tuples in the table </td></tr> +<tr> +<th>sparse_r </th><td>DOUBLE PRECISION[]. 
Array union of individual single state features (previous label, label, feature index, start position, training existence indicator), ordered by their start position. </td></tr> +<tr> +<th>dense_m </th><td>DOUBLE PRECISION[]. Array union of (previous label, label, feature index, start position, training existence indicator) of edge features ordered by start position. </td></tr> +<tr> +<th>sparse_m </th><td>DOUBLE PRECISION[]. Array union of (feature index, previous label, label) of edge features ordered by feature index. </td></tr> +</table> +<p class="enddd"></p> +</dd> +<dt>train_featureset_tbl </dt> +<dd>TEXT. Name of the table to be created containing distinct featuresets generated from training feature extraction. The table will have the following columns: <table class="output"> +<tr> +<th>f_index </th><td>INTEGER. Column containing distinct featureset ids </td></tr> +<tr> +<th>f_name </th><td>TEXT. Feature name </td></tr> +<tr> +<th>feature </th><td>ARRAY. Feature value. The value is of the form [L1, L2] <br /> + - If L1 = -1: represents single state feature with L2 being the current label id. <br /> + - If L1 != -1: represents transition feature with L1 being the previous label and L2 being the current label. </td></tr> +</table> +</dd> +</dl> +</dd></dl> +<p><a class="anchor" id="train"></a></p><dl class="section user"><dt>Linear Chain CRF Training Function</dt><dd>The function takes <code>train_feature_tbl</code> and <code>train_featureset_tbl</code> tables generated in the training feature generation steps as input along with other required parameters and produces two output tables <code>crf_stats_tbl</code> and <code>crf_weights_tbl</code>.</dd></dl> +<pre class="syntax"> +lincrf_train(train_feature_tbl, + train_featureset_tbl, + label_tbl, + crf_stats_tbl, + crf_weights_tbl, + max_iterations + ) +</pre><p> <b>Arguments</b> </p><dl class="arglist"> +<dt>train_feature_tbl </dt> +<dd><p class="startdd">TEXT. 
Name of the feature table generated during training feature generation</p> +<p class="enddd"></p> +</dd> +<dt>train_featureset_tbl </dt> +<dd><p class="startdd">TEXT. Name of the featureset table generated during training feature generation</p> +<p class="enddd"></p> +</dd> +<dt>label_tbl </dt> +<dd><p class="startdd">TEXT. Name of the label table used</p> +<p class="enddd"></p> +</dd> +<dt>crf_stats_tbl </dt> +<dd>TEXT. Name of the table to be created containing statistics for CRF training. The table has the following columns: <table class="output"> +<tr> +<th>coef </th><td>DOUBLE PRECISION[]. Array of coefficients </td></tr> +<tr> +<th>log_likelihood </th><td>DOUBLE PRECISION. Log-likelihood </td></tr> +<tr> +<th>num_iterations </th><td>INTEGER. The number of iterations at which the algorithm terminated </td></tr> +</table> +</dd> +<dt>crf_weights_tbl </dt> +<dd><p class="startdd">TEXT. Name of the table to be created containing the learned feature weights. The table has the following columns: </p><table class="output"> +<tr> +<th>id </th><td>INTEGER. Feature set id </td></tr> +<tr> +<th>name </th><td>TEXT. Feature name </td></tr> +<tr> +<th>prev_label_id </th><td>INTEGER. Label for the previous token encountered </td></tr> +<tr> +<th>label_id </th><td>INTEGER. Label of the token with the respective feature </td></tr> +<tr> +<th>weight </th><td>DOUBLE PRECISION. Weight for the respective feature set </td></tr> +</table> +<p class="enddd"></p> +</dd> +<dt>max_iterations </dt> +<dd>INTEGER. The maximum number of iterations </dd> +</dl> +<p><a class="anchor" id="test_feature"></a></p><dl class="section user"><dt>Testing Feature Generation</dt><dd></dd></dl> +<pre class="syntax"> +crf_test_fgen(test_segment_tbl, + dictionary_tbl, + label_tbl, + regex_tbl, + crf_weights_tbl, + viterbi_mtbl, + viterbi_rtbl + ) +</pre><p> <b>Arguments</b> </p><dl class="arglist"> +<dt>test_segment_tbl </dt> +<dd><p class="startdd">TEXT. Name of the testing segment table. 
The table is expected to have the following columns: </p><table class="output"> +<tr> +<th>doc_id </th><td>INTEGER. Document id column </td></tr> +<tr> +<th>start_pos </th><td>INTEGER. Index of a particular term in the respective document </td></tr> +<tr> +<th>seg_text </th><td>TEXT. Term at the respective <code>start_pos</code> in the document </td></tr> +</table> +<p class="enddd"></p> +</dd> +<dt>dictionary_tbl </dt> +<dd><p class="startdd">TEXT. Name of the dictionary table created during training feature generation (<code>crf_train_fgen</code>)</p> +<p class="enddd"></p> +</dd> +<dt>label_tbl </dt> +<dd><p class="startdd">TEXT. Name of the label table</p> +<p class="enddd"></p> +</dd> +<dt>regex_tbl </dt> +<dd><p class="startdd">TEXT. Name of the regular expression table</p> +<p class="enddd"></p> +</dd> +<dt>crf_weights_tbl </dt> +<dd><p class="startdd">TEXT. Name of the weights table generated during CRF training (<code>lincrf_train</code>)</p> +<p class="enddd"></p> +</dd> +<dt>viterbi_mtbl </dt> +<dd><p class="startdd">TEXT. Name of the Viterbi M table to be created</p> +<p class="enddd"></p> +</dd> +<dt>viterbi_rtbl </dt> +<dd>TEXT. Name of the Viterbi R table to be created </dd> +</dl> +<p><a class="anchor" id="inference"></a></p><dl class="section user"><dt>Inference using Viterbi</dt><dd><pre class="syntax"> +vcrf_label(test_segment_tbl, + viterbi_mtbl, + viterbi_rtbl, + label_tbl, + result_tbl) +</pre> <b>Arguments</b> <dl class="arglist"> +<dt>test_segment_tbl </dt> +<dd>TEXT. Name of the testing segment table. For required table schema, please refer to arguments in previous section </dd> +<dt>viterbi_mtbl </dt> +<dd>TEXT. Name of the table <code>viterbi_mtbl</code> generated from testing feature generation <code>crf_test_fgen</code>. </dd> +<dt>viterbi_rtbl </dt> +<dd>TEXT. Name of the table <code>viterbi_rtbl</code> generated from testing feature generation <code>crf_test_fgen</code>. </dd> +<dt>label_tbl </dt> +<dd>TEXT. Name of the label table. 
</dd> +<dt>result_tbl </dt> +<dd>TEXT. Name of the result table to be created containing extracted best label sequences. </dd> +</dl> +</dd></dl> +<p><a class="anchor" id="usage"></a></p><dl class="section user"><dt>Using CRF</dt><dd></dd></dl> +<p>Generate text features, calculate their weights, and output the best label sequence for test data:<br /> +</p><ol type="1"> +<li>Perform feature generation on training data i.e. <code>train_segment_tbl</code> generating <code>train_feature_tbl</code> and <code>train_featureset_tbl</code>. <pre>SELECT madlib.crf_train_fgen( + '<em>train_segment_tbl</em>', + '<em>regex_tbl</em>', + '<em>label_tbl</em>', + '<em>dictionary_tbl</em>', + '<em>train_feature_tbl</em>', + '<em>train_featureset_tbl</em>');</pre></li> +<li>Use linear-chain CRF for training providing <code>train_feature_tbl</code> and <code>train_featureset_tbl</code> generated from previous step as an input. <pre>SELECT madlib.lincrf_train( + '<em>train_feature_tbl</em>', + '<em>train_featureset_tbl</em>', + '<em>label_tbl</em>', + '<em>crf_stats_tbl</em>', + '<em>crf_weights_tbl</em>', + <em>max_iterations</em>);</pre></li> +<li>Perform feature generation on testing data <code>test_segment_tbl</code> generating <code>viterbi_mtbl</code> and <code>viterbi_rtbl</code> required for inferencing. <pre>SELECT madlib.crf_test_fgen( + '<em>test_segment_tbl</em>', + '<em>dictionary_tbl</em>', + '<em>label_tbl</em>', + '<em>regex_tbl</em>', + '<em>crf_weights_tbl</em>', + '<em>viterbi_mtbl</em>', + '<em>viterbi_rtbl</em>');</pre></li> +<li>Run the Viterbi function to get the best label sequence and the conditional probability \( \Pr( \text{best label sequence} \mid \text{sequence}) \). 
<pre>SELECT madlib.vcrf_label( + '<em>test_segment_tbl</em>', + '<em>viterbi_mtbl</em>', + '<em>viterbi_rtbl</em>', + '<em>label_tbl</em>', + '<em>result_tbl</em>');</pre></li> +</ol> +<p><a class="anchor" id="examples"></a></p><dl class="section user"><dt>Examples</dt><dd>This example uses a trivial training and test data set.</dd></dl> +<ol type="1"> +<li>Load the label table, the regular expressions table, and the training segment table: <pre class="example"> +SELECT * FROM crf_label ORDER BY id; +</pre> Result: <pre class="result"> + id | label + ---+------- + 0 | # + 1 | $ + 2 | '' +... + 8 | CC + 9 | CD + 10 | DT + 11 | EX + 12 | FW + 13 | IN + 14 | JJ +... +</pre> The regular expressions table: <pre class="example"> +SELECT * from crf_regex; +</pre> <pre class="result"> + pattern | name + --------------+---------------------- + ^.+ing$ | endsWithIng + ^[A-Z][a-z]+$ | InitCapital + ^[A-Z]+$ | isAllCapital + ^.*[0-9]+.*$ | containsDigit +... +</pre> The training segment table: <pre class="example"> +SELECT * from train_segmenttbl ORDER BY doc_id, start_pos; +</pre> <pre class="result"> + doc_id | start_pos | seg_text | label + -------+-----------+------------+------- + 0 | 0 | Confidence | 18 + 0 | 1 | in | 13 + 0 | 2 | the | 10 + 0 | 3 | pound | 18 + 0 | 4 | is | 38 + 0 | 5 | widely | 26 +... + 1 | 0 | Chancellor | 19 + 1 | 1 | of | 13 + 1 | 2 | the | 10 + 1 | 3 | Exchequer | 19 + 1 | 4 | Nigel | 19 +... +</pre></li> +<li>Generate the training features: <pre class="example"> +SELECT crf_train_fgen( 'train_segmenttbl', + 'crf_regex', + 'crf_label', + 'crf_dictionary', + 'train_featuretbl', + 'train_featureset' + ); +SELECT * from crf_dictionary; +</pre> Result: <pre class="result"> + token | total + ----------------+------- + Hawthorne | 1 + Mercedes-Benzes | 1 + Wolf | 3 + best-known | 1 + hairline | 1 + accepting | 2 + purchases | 14 + trash | 5 + co-venture | 1 + restaurants | 7 +... 
+</pre> <pre class="example"> +SELECT * from train_featuretbl; +</pre> Result: <pre class="result"> + doc_id | f_size | sparse_r | dense_m | sparse_m + -------+--------+-------------------------------+---------------------------------+----------------------- + 2 | 87 | {-1,13,12,0,1,-1,13,9,0,1,..} | {13,31,79,1,1,31,29,70,2,1,...} | {51,26,2,69,29,17,...} + 1 | 87 | {-1,13,0,0,1,-1,13,9,0,1,...} | {13,0,62,1,1,0,13,54,2,1,13,..} | {51,26,2,69,29,17,...} +</pre> <pre class="example"> +SELECT * from train_featureset; +</pre> <pre class="result"> + f_index | f_name | feature + --------+---------------+--------- + 1 | R_endsWithED | {-1,29} + 13 | W_outweigh | {-1,26} + 29 | U | {-1,5} + 31 | U | {-1,29} + 33 | U | {-1,12} + 35 | W_a | {-1,2} + 37 | W_possible | {-1,6} + 15 | W_signaled | {-1,29} + 17 | End. | {-1,43} + 49 | W_'s | {-1,16} + 63 | W_acquire | {-1,26} + 51 | E. | {26,2} + 69 | E. | {29,17} + 71 | E. | {2,11} + 83 | W_the | {-1,2} + 85 | E. | {16,11} + 4 | W_return | {-1,11} +... +</pre></li> +<li>Train using linear CRF: <pre class="example"> +SELECT lincrf_train( 'train_featuretbl', + 'train_featureset', + 'crf_label', + 'crf_stats_tbl', + 'crf_weights_tbl', + 20 + ); +</pre> <pre class="result"> + lincrf_train + ----------------------------------------------------------------------------------- + CRF Train successful. Results stored in the specified CRF stats and weights table + lincrf +</pre> View the feature weight table. 
<pre class="example"> +SELECT * from crf_weights_tbl; +</pre> Result: <pre class="result"> + id | name | prev_label_id | label_id | weight + ---+---------------+---------------+----------+------------------- + 1 | R_endsWithED | -1 | 29 | 1.54128249293937 + 13 | W_outweigh | -1 | 26 | 1.70691232223653 + 29 | U | -1 | 5 | 1.40708515869008 + 31 | U | -1 | 29 | 0.830356200936407 + 33 | U | -1 | 12 | 0.769587378281239 + 35 | W_a | -1 | 2 | 2.68470625883726 + 37 | W_possible | -1 | 6 | 3.41773107604468 + 15 | W_signaled | -1 | 29 | 1.68187039165771 + 17 | End. | -1 | 43 | 3.07687845517082 + 49 | W_'s | -1 | 16 | 2.61430312229883 + 63 | W_acquire | -1 | 26 | 1.67247047385797 + 51 | E. | 26 | 2 | 3.0114240119435 + 69 | E. | 29 | 17 | 2.82385531733866 + 71 | E. | 2 | 11 | 3.00970493772732 + 83 | W_the | -1 | 2 | 2.58742315259326 +... +</pre></li> +<li>To find the best labels for a test set using the trained linear CRF model, repeat steps #1-2 and generate the test features, except instead of creating a new dictionary, use the dictionary generated from the training set. <pre class="example"> +SELECT * from test_segmenttbl ORDER BY doc_id, start_pos; +</pre> Result: <pre class="result"> + doc_id | start_pos | seg_text + -------+-----------+--------------- + 0 | 0 | Rockwell + 0 | 1 | International + 0 | 2 | Corp. + 0 | 3 | 's + 0 | 4 | Tulsa + 0 | 5 | unit + 0 | 6 | said +... + 1 | 0 | Rockwell + 1 | 1 | said + 1 | 2 | the + 1 | 3 | agreement + 1 | 4 | calls +... +</pre> <pre class="example"> +SELECT crf_test_fgen( 'test_segmenttbl', + 'crf_dictionary', + 'crf_label', + 'crf_regex', + 'crf_weights_tbl', + 'viterbi_mtbl', + 'viterbi_rtbl' + ); +</pre></li> +<li>Calculate the best label sequence and save in the table <code>extracted_best_labels</code>. <pre class="example"> +SELECT vcrf_label( 'test_segmenttbl', + 'viterbi_mtbl', + 'viterbi_rtbl', + 'crf_label', + 'extracted_best_labels' + ); +</pre> View the best labels. 
<pre class="example"> +SELECT * FROM extracted_best_labels; +</pre> Result: <pre class="result"> + doc_id | start_pos | seg_text | label | id | max_pos | prob + -------+-----------+---------------+-------+----+---------+---------- + 0 | 0 | Rockwell | NNP | 19 | 27 | 0.000269 + 0 | 1 | International | NNP | 19 | 27 | 0.000269 + 0 | 2 | Corp. | NNP | 19 | 27 | 0.000269 + 0 | 3 | 's | NNP | 19 | 27 | 0.000269 +... + 1 | 0 | Rockwell | NNP | 19 | 16 | 0.000168 + 1 | 1 | said | NNP | 19 | 16 | 0.000168 + 1 | 2 | the | DT | 10 | 16 | 0.000168 + 1 | 3 | agreement | JJ | 14 | 16 | 0.000168 + 1 | 4 | calls | NNS | 21 | 16 | 0.000168 +... +</pre></li> +</ol> +<p><a class="anchor" id="background"></a></p><dl class="section user"><dt>Technical Background</dt><dd></dd></dl> +<p>Specifically, a linear-chain CRF is a distribution defined by </p><p class="formulaDsp"> +\[ p_\lambda(\boldsymbol y | \boldsymbol x) = \frac{\exp{\sum_{m=1}^M \lambda_m F_m(\boldsymbol x, \boldsymbol y)}}{Z_\lambda(\boldsymbol x)} \,. 
\] +</p> +<p>where</p><ul> +<li>\( F_m(\boldsymbol x, \boldsymbol y) = \sum_{i=1}^n f_m(y_i,y_{i-1},x_i) \) is a global feature function that is a sum along a sequence \( \boldsymbol x \) of length \( n \)</li> +<li>\( f_m(y_i,y_{i-1},x_i) \) is a local feature function dependent on the current token label \( y_i \), the previous token label \( y_{i-1} \), and the observation \( x_i \)</li> +<li>\( \lambda_m \) is the corresponding feature weight</li> +<li>\( Z_\lambda(\boldsymbol x) \) is an instance-specific normalizer <p class="formulaDsp"> +\[ Z_\lambda(\boldsymbol x) = \sum_{\boldsymbol y'} \exp{\sum_{m=1}^M \lambda_m F_m(\boldsymbol x, \boldsymbol y')} \] +</p> +</li> +</ul> +<p>A linear-chain CRF estimates the weights \( \lambda_m \) by maximizing the log-likelihood of a given training set \( T=\{(x_k,y_k)\}_{k=1}^N \).</p> +<p>The log-likelihood is defined as </p><p class="formulaDsp"> +\[ \ell_{\lambda}=\sum_k \log p_\lambda(y_k|x_k) =\sum_k[\sum_{m=1}^M \lambda_m F_m(x_k,y_k) - \log Z_\lambda(x_k)] \] +</p> +<p>and the zero of its gradient </p><p class="formulaDsp"> +\[ \nabla \ell_{\lambda}=\sum_k[F(x_k,y_k)-E_{p_\lambda(Y|x_k)}[F(x_k,Y)]] \] +</p> +<p>is found since the maximum likelihood is reached when the empirical average of the global feature vector equals its model expectation. 
The MADlib implementation uses limited-memory BFGS (L-BFGS), a limited-memory variation of the Broyden–Fletcher–Goldfarb–Shanno (BFGS) update, a quasi-Newton method for unconstrained optimization.</p> +<p>\(E_{p_\lambda(Y|x)}[F(x,Y)]\) is found by using a variant of the forward-backward algorithm: </p><p class="formulaDsp"> +\[ E_{p_\lambda(Y|x)}[F(x,Y)] = \sum_y p_\lambda(y|x)F(x,y) = \sum_i\frac{\alpha_{i-1}(f_i*M_i)\beta_i^T}{Z_\lambda(x)} \] +</p> + <p class="formulaDsp"> +\[ Z_\lambda(x) = \alpha_n \cdot \mathbf{1}^T \] +</p> +<p> where \(\alpha_i\) and \( \beta_i\) are the forward and backward state cost vectors defined by </p><p class="formulaDsp"> +\[ \alpha_i = \begin{cases} \alpha_{i-1}M_i, &amp; 0 &lt; i \le n\\ 1, &amp; i=0 \end{cases} \] +</p> + <p class="formulaDsp"> +\[ \beta_i^T = \begin{cases} M_{i+1}\beta_{i+1}^T, &amp; 1 \le i &lt; n\\ 1, &amp; i=n \end{cases} \] +</p> +<p>To avoid overfitting, we penalize the likelihood with a spherical Gaussian weight prior: </p><p class="formulaDsp"> +\[ \ell_{\lambda}^\prime=\sum_k[\sum_{m=1}^M \lambda_m F_m(x_k,y_k) - \log Z_\lambda(x_k)] - \frac{\lVert \lambda \rVert^2}{2\sigma ^2} \] +</p> +<p class="formulaDsp"> +\[ \nabla \ell_{\lambda}^\prime=\sum_k[F(x_k,y_k) - E_{p_\lambda(Y|x_k)}[F(x_k,Y)]] - \frac{\lambda}{\sigma ^2} \] +</p> +<p><a class="anchor" id="literature"></a></p><dl class="section user"><dt>Literature</dt><dd>[1] F. Sha, F. Pereira. Shallow Parsing with Conditional Random Fields, <a href="http://www-bcf.usc.edu/~feisha/pubs/shallow03.pdf">http://www-bcf.usc.edu/~feisha/pubs/shallow03.pdf</a></dd></dl> +<p>[2] Wikipedia, Conditional Random Field, <a href="http://en.wikipedia.org/wiki/Conditional_random_field">http://en.wikipedia.org/wiki/Conditional_random_field</a></p> +<p>[3] A. Jaiswal, S. Tawari, I. Mansuri, K. Mittal, C. Tiwari (2012), CRF, <a href="http://crf.sourceforge.net/">http://crf.sourceforge.net/</a></p> +<p>[4] D. 
Wang, ViterbiCRF, <a href="http://www.cs.berkeley.edu/~daisyw/ViterbiCRF.html">http://www.cs.berkeley.edu/~daisyw/ViterbiCRF.html</a></p> +<p>[5] Wikipedia, Viterbi Algorithm, <a href="http://en.wikipedia.org/wiki/Viterbi_algorithm">http://en.wikipedia.org/wiki/Viterbi_algorithm</a></p> +<p>[6] J. Nocedal. Updating Quasi-Newton Matrices with Limited Storage (1980), Mathematics of Computation 35, pp. 773-782</p> +<p>[7] J. Nocedal, Software for Large-scale Unconstrained Optimization, <a href="http://users.eecs.northwestern.edu/~nocedal/lbfgs.html">http://users.eecs.northwestern.edu/~nocedal/lbfgs.html</a></p> +<p><a class="anchor" id="related"></a></p><dl class="section user"><dt>Related Topics</dt><dd></dd></dl> +<p>File <a class="el" href="crf_8sql__in.html" title="SQL functions for conditional random field. ">crf.sql_in</a> <a class="el" href="crf__feature__gen_8sql__in.html" title="SQL function for POS/NER feature extraction. ">crf_feature_gen.sql_in</a> <a class="el" href="viterbi_8sql__in.html" title="concatenate a set of input values into arrays to feed into viterbi c function and create a human read...">viterbi.sql_in</a> (documenting the SQL functions) </p> +</div><!-- contents --> +</div><!-- doc-content --> +<!-- start footer part --> +<div id="nav-path" class="navpath"><!-- id is needed for treeview function! --> + <ul> + <li class="footer">Generated on Mon Oct 15 2018 11:24:30 for MADlib by + <a href="http://www.doxygen.org/index.html"> + <img class="footer" src="doxygen.png" alt="doxygen"/></a> 1.8.14 </li> + </ul> +</div> +</body> +</html>
http://git-wip-us.apache.org/repos/asf/madlib-site/blob/af0e5f14/docs/v1.15.1/group__grp__datatrans.html ---------------------------------------------------------------------- diff --git a/docs/v1.15.1/group__grp__datatrans.html b/docs/v1.15.1/group__grp__datatrans.html new file mode 100644 index 0000000..0b7dede --- /dev/null +++ b/docs/v1.15.1/group__grp__datatrans.html @@ -0,0 +1,161 @@ +<!-- HTML header for doxygen 1.8.4--> +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> +<html xmlns="http://www.w3.org/1999/xhtml"> +<head> +<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/> +<meta http-equiv="X-UA-Compatible" content="IE=9"/> +<meta name="generator" content="Doxygen 1.8.14"/> +<meta name="keywords" content="madlib,postgres,greenplum,machine learning,data mining,deep learning,ensemble methods,data science,market basket analysis,affinity analysis,pca,lda,regression,elastic net,huber white,proportional hazards,k-means,latent dirichlet allocation,bayes,support vector machines,svm"/> +<title>MADlib: Data Types and Transformations</title> +<link href="tabs.css" rel="stylesheet" type="text/css"/> +<script type="text/javascript" src="jquery.js"></script> +<script type="text/javascript" src="dynsections.js"></script> +<link href="navtree.css" rel="stylesheet" type="text/css"/> +<script type="text/javascript" src="resize.js"></script> +<script type="text/javascript" src="navtreedata.js"></script> +<script type="text/javascript" src="navtree.js"></script> +<script type="text/javascript"> +/* @license magnet:?xt=urn:btih:cf05388f2679ee054f2beb29a391d25f4e673ac3&dn=gpl-2.0.txt GPL-v2 */ + $(document).ready(initResizable); +/* @license-end */</script> +<link href="search/search.css" rel="stylesheet" type="text/css"/> +<script type="text/javascript" src="search/searchdata.js"></script> +<script type="text/javascript" src="search/search.js"></script> +<script type="text/javascript"> 
+/* @license magnet:?xt=urn:btih:cf05388f2679ee054f2beb29a391d25f4e673ac3&dn=gpl-2.0.txt GPL-v2 */ + $(document).ready(function() { init_search(); }); +/* @license-end */ +</script> +<script type="text/x-mathjax-config"> + MathJax.Hub.Config({ + extensions: ["tex2jax.js", "TeX/AMSmath.js", "TeX/AMSsymbols.js"], + jax: ["input/TeX","output/HTML-CSS"], +}); +</script><script type="text/javascript" async src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.2/MathJax.js"></script> +<!-- hack in the navigation tree --> +<script type="text/javascript" src="eigen_navtree_hacks.js"></script> +<link href="doxygen.css" rel="stylesheet" type="text/css" /> +<link href="madlib_extra.css" rel="stylesheet" type="text/css"/> +<!-- google analytics --> +<script> + (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){ + (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o), + m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m) + })(window,document,'script','//www.google-analytics.com/analytics.js','ga'); + ga('create', 'UA-45382226-1', 'madlib.apache.org'); + ga('send', 'pageview'); +</script> +</head> +<body> +<div id="top"><!-- do not remove this div, it is closed by doxygen! 
--> +<div id="titlearea"> +<table cellspacing="0" cellpadding="0"> + <tbody> + <tr style="height: 56px;"> + <td id="projectlogo"><a href="http://madlib.apache.org"><img alt="Logo" src="madlib.png" height="50" style="padding-left:0.5em;" border="0"/ ></a></td> + <td style="padding-left: 0.5em;"> + <div id="projectname"> + <span id="projectnumber">1.15.1</span> + </div> + <div id="projectbrief">User Documentation for Apache MADlib</div> + </td> + <td> <div id="MSearchBox" class="MSearchBoxInactive"> + <span class="left"> + <img id="MSearchSelect" src="search/mag_sel.png" + onmouseover="return searchBox.OnSearchSelectShow()" + onmouseout="return searchBox.OnSearchSelectHide()" + alt=""/> + <input type="text" id="MSearchField" value="Search" accesskey="S" + onfocus="searchBox.OnSearchFieldFocus(true)" + onblur="searchBox.OnSearchFieldFocus(false)" + onkeyup="searchBox.OnSearchFieldChange(event)"/> + </span><span class="right"> + <a id="MSearchClose" href="javascript:searchBox.CloseResultsWindow()"><img id="MSearchCloseImg" border="0" src="search/close.png" alt=""/></a> + </span> + </div> +</td> + </tr> + </tbody> +</table> +</div> +<!-- end header part --> +<!-- Generated by Doxygen 1.8.14 --> +<script type="text/javascript"> +/* @license magnet:?xt=urn:btih:cf05388f2679ee054f2beb29a391d25f4e673ac3&dn=gpl-2.0.txt GPL-v2 */ +var searchBox = new SearchBox("searchBox", "search",false,'Search'); +/* @license-end */ +</script> +</div><!-- top --> +<div id="side-nav" class="ui-resizable side-nav-resizable"> + <div id="nav-tree"> + <div id="nav-tree-contents"> + <div id="nav-sync" class="sync"></div> + </div> + </div> + <div id="splitbar" style="-moz-user-select:none;" + class="ui-resizable-handle"> + </div> +</div> +<script type="text/javascript"> +/* @license magnet:?xt=urn:btih:cf05388f2679ee054f2beb29a391d25f4e673ac3&dn=gpl-2.0.txt GPL-v2 */ +$(document).ready(function(){initNavTree('group__grp__datatrans.html','');}); +/* @license-end */ +</script> +<div 
id="doc-content"> +<!-- window showing the filter options --> +<div id="MSearchSelectWindow" + onmouseover="return searchBox.OnSearchSelectShow()" + onmouseout="return searchBox.OnSearchSelectHide()" + onkeydown="return searchBox.OnSearchSelectKey(event)"> +</div> + +<!-- iframe showing the search results (closed by default) --> +<div id="MSearchResultsWindow"> +<iframe src="javascript:void(0)" frameborder="0" + name="MSearchResults" id="MSearchResults"> +</iframe> +</div> + +<div class="header"> + <div class="summary"> +<a href="#groups">Modules</a> </div> + <div class="headertitle"> +<div class="title">Data Types and Transformations</div> </div> +</div><!--header--> +<div class="contents"> +<a name="details" id="details"></a><h2 class="groupheader">Detailed Description</h2> +<p>Data types and operations that transform and shape data. </p> +<table class="memberdecls"> +<tr class="heading"><td colspan="2"><h2 class="groupheader"><a name="groups"></a> +Modules</h2></td></tr> +<tr class="memitem:group__grp__arraysmatrix"><td class="memItemLeft" align="right" valign="top"> </td><td class="memItemRight" valign="bottom"><a class="el" href="group__grp__arraysmatrix.html">Arrays and Matrices</a></td></tr> +<tr class="memdesc:group__grp__arraysmatrix"><td class="mdescLeft"> </td><td class="mdescRight">Mathematical operations for arrays and matrices. <br /></td></tr> +<tr class="separator:"><td class="memSeparator" colspan="2"> </td></tr> +<tr class="memitem:group__grp__encode__categorical"><td class="memItemLeft" align="right" valign="top"> </td><td class="memItemRight" valign="bottom"><a class="el" href="group__grp__encode__categorical.html">Encoding Categorical Variables</a></td></tr> +<tr class="memdesc:group__grp__encode__categorical"><td class="mdescLeft"> </td><td class="mdescRight">Functions to encode categorical variables to prepare data for input into predictive algorithms. 
<br /></td></tr> +<tr class="separator:"><td class="memSeparator" colspan="2"> </td></tr> +<tr class="memitem:group__grp__path"><td class="memItemLeft" align="right" valign="top"> </td><td class="memItemRight" valign="bottom"><a class="el" href="group__grp__path.html">Path</a></td></tr> +<tr class="memdesc:group__grp__path"><td class="mdescLeft"> </td><td class="mdescRight">A function to perform complex pattern matching across rows and extract useful information about the matches. <br /></td></tr> +<tr class="separator:"><td class="memSeparator" colspan="2"> </td></tr> +<tr class="memitem:group__grp__pivot"><td class="memItemLeft" align="right" valign="top"> </td><td class="memItemRight" valign="bottom"><a class="el" href="group__grp__pivot.html">Pivot</a></td></tr> +<tr class="memdesc:group__grp__pivot"><td class="mdescLeft"> </td><td class="mdescRight">Pivoting and data summarization tools for preparing data for modeling operations. <br /></td></tr> +<tr class="separator:"><td class="memSeparator" colspan="2"> </td></tr> +<tr class="memitem:group__grp__sessionize"><td class="memItemLeft" align="right" valign="top"> </td><td class="memItemRight" valign="bottom"><a class="el" href="group__grp__sessionize.html">Sessionize</a></td></tr> +<tr class="memdesc:group__grp__sessionize"><td class="mdescLeft"> </td><td class="mdescRight">Session reconstruction of data consisting of a time stamped sequence of events. <br /></td></tr> +<tr class="separator:"><td class="memSeparator" colspan="2"> </td></tr> +<tr class="memitem:group__grp__stemmer"><td class="memItemLeft" align="right" valign="top"> </td><td class="memItemRight" valign="bottom"><a class="el" href="group__grp__stemmer.html">Stemming</a></td></tr> +<tr class="memdesc:group__grp__stemmer"><td class="mdescLeft"> </td><td class="mdescRight">Provides porter stemmer operations supporting other MADlib modules. 
<br /></td></tr> +<tr class="separator:"><td class="memSeparator" colspan="2"> </td></tr> +</table> +</div><!-- contents --> +</div><!-- doc-content --> +<!-- start footer part --> +<div id="nav-path" class="navpath"><!-- id is needed for treeview function! --> + <ul> + <li class="footer">Generated on Mon Oct 15 2018 11:24:30 for MADlib by + <a href="http://www.doxygen.org/index.html"> + <img class="footer" src="doxygen.png" alt="doxygen"/></a> 1.8.14 </li> + </ul> +</div> +</body> +</html> http://git-wip-us.apache.org/repos/asf/madlib-site/blob/af0e5f14/docs/v1.15.1/group__grp__datatrans.js ---------------------------------------------------------------------- diff --git a/docs/v1.15.1/group__grp__datatrans.js b/docs/v1.15.1/group__grp__datatrans.js new file mode 100644 index 0000000..4900455 --- /dev/null +++ b/docs/v1.15.1/group__grp__datatrans.js @@ -0,0 +1,9 @@ +var group__grp__datatrans = +[ + [ "Arrays and Matrices", "group__grp__arraysmatrix.html", "group__grp__arraysmatrix" ], + [ "Encoding Categorical Variables", "group__grp__encode__categorical.html", null ], + [ "Path", "group__grp__path.html", null ], + [ "Pivot", "group__grp__pivot.html", null ], + [ "Sessionize", "group__grp__sessionize.html", null ], + [ "Stemming", "group__grp__stemmer.html", null ] +]; \ No newline at end of file
