http://git-wip-us.apache.org/repos/asf/predictionio-site/blob/9d2bd407/demo/textclassification/index.html
----------------------------------------------------------------------
diff --git a/demo/textclassification/index.html
b/demo/textclassification/index.html
index 0d1d516..58a604e 100644
--- a/demo/textclassification/index.html
+++ b/demo/textclassification/index.html
@@ -1,4 +1,4 @@
-<!DOCTYPE html><html><head><title>Text Classification Engine
Tutorial</title><meta charset="utf-8"/><meta content="IE=edge,chrome=1"
http-equiv="X-UA-Compatible"/><meta name="viewport"
content="width=device-width, initial-scale=1.0"/><meta class="swiftype"
name="title" data-type="string" content="Text Classification Engine
Tutorial"/><link rel="canonical"
href="https://predictionio.apache.org/demo/textclassification/"/><link
href="/images/favicon/normal-b330020a.png" rel="shortcut icon"/><link
href="/images/favicon/apple-c0febcf2.png" rel="apple-touch-icon"/><link
href="//fonts.googleapis.com/css?family=Open+Sans:300italic,400italic,600italic,700italic,800italic,400,300,600,700,800"
rel="stylesheet"/><link
href="//maxcdn.bootstrapcdn.com/font-awesome/4.2.0/css/font-awesome.min.css"
rel="stylesheet"/><link href="/stylesheets/application-eccfc6cb.css"
rel="stylesheet" type="text/css"/><script
src="//cdnjs.cloudflare.com/ajax/libs/html5shiv/3.7.2/html5shiv.min.js"></script><script
src=
"//cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML"></script><script
src="//use.typekit.net/pqo0itb.js"></script><script>try{Typekit.load({ async:
true });}catch(e){}</script></head><body><div id="global"><header><div
class="container" id="header-wrapper"><div class="row"><div
class="col-sm-12"><div id="logo-wrapper"><span id="drawer-toggle"></span><a
href="#"></a><a href="http://predictionio.apache.org/"><img alt="Apache
PredictionIO" id="logo"
src="/images/logos/logo-ee2b9bb3.png"/></a><span>®</span></div><div
id="menu-wrapper"><div id="pill-wrapper"><a class="pill left"
href="/gallery/template-gallery">TEMPLATES</a> <a class="pill right"
href="//github.com/apache/incubator-predictionio/">OPEN
SOURCE</a></div></div><img class="mobile-search-bar-toggler hidden-md
hidden-lg"
src="/images/icons/search-glass-704bd4ff.png"/></div></div></div></header><div
id="search-bar-row-wrapper"><div class="container-fluid"
id="search-bar-row"><div class="row"><div
class="col-md-9 col-sm-11 col-xs-11"><div class="hidden-md hidden-lg"
id="mobile-page-heading-wrapper"><p>PredictionIO Docs</p><h4>Text
Classification Engine Tutorial</h4></div><h4 class="hidden-sm
hidden-xs">PredictionIO Docs</h4></div><div class="col-md-3 col-sm-1 col-xs-1
hidden-md hidden-lg"><img id="left-menu-indicator"
src="/images/icons/down-arrow-dfe9f7fe.png"/></div><div class="col-md-3
col-sm-12 col-xs-12 swiftype-wrapper"><div class="swiftype"><form
class="search-form"><img class="search-box-toggler hidden-xs hidden-sm"
src="/images/icons/search-glass-704bd4ff.png"/><div class="search-box"><img
src="/images/icons/search-glass-704bd4ff.png"/><input type="text"
id="st-search-input" class="st-search-input" placeholder="Search
Doc..."/></div><img class="swiftype-row-hider hidden-md hidden-lg"
src="/images/icons/drawer-toggle-active-fcbef12a.png"/></form></div></div><div
class="mobile-left-menu-toggler hidden-md
hidden-lg"></div></div></div></div><div id="page" class="container-fluid">
<div class="row"><div id="left-menu-wrapper" class="col-md-3"><nav
id="nav-main"><ul><li class="level-1"><a class="expandible"
href="/"><span>Apache PredictionIO® Documentation</span></a><ul><li
class="level-2"><a class="final" href="/"><span>Welcome to Apache
PredictionIO®</span></a></li></ul></li><li class="level-1"><a
class="expandible" href="#"><span>Getting Started</span></a><ul><li
class="level-2"><a class="final" href="/start/"><span>A Quick
Intro</span></a></li><li class="level-2"><a class="final"
href="/install/"><span>Installing Apache PredictionIO</span></a></li><li
class="level-2"><a class="final" href="/start/download/"><span>Downloading an
Engine Template</span></a></li><li class="level-2"><a class="final"
href="/start/deploy/"><span>Deploying Your First Engine</span></a></li><li
class="level-2"><a class="final" href="/start/customize/"><span>Customizing the
Engine</span></a></li></ul></li><li class="level-1"><a class="expandible"
href="#"><span>Integrating with Your
App</span></a><ul><li class="level-2"><a class="final"
href="/appintegration/"><span>App Integration Overview</span></a></li><li
class="level-2"><a class="expandible" href="/sdk/"><span>List of
SDKs</span></a><ul><li class="level-3"><a class="final"
href="/sdk/java/"><span>Java & Android SDK</span></a></li><li
class="level-3"><a class="final" href="/sdk/php/"><span>PHP
SDK</span></a></li><li class="level-3"><a class="final"
href="/sdk/python/"><span>Python SDK</span></a></li><li class="level-3"><a
class="final" href="/sdk/ruby/"><span>Ruby SDK</span></a></li><li
class="level-3"><a class="final" href="/sdk/community/"><span>Community Powered
SDKs</span></a></li></ul></li></ul></li><li class="level-1"><a
class="expandible" href="#"><span>Deploying an Engine</span></a><ul><li
class="level-2"><a class="final" href="/deploy/"><span>Deploying as a Web
Service</span></a></li><li class="level-2"><a class="final"
href="/batchpredict/"><span>Batch Predictions</span></a></li><li
class="level-2"><a class="final" href="/deploy/monitoring/"><span>Monitoring
Engine</span></a></li><li class="level-2"><a class="final"
href="/deploy/engineparams/"><span>Setting Engine Parameters</span></a></li><li
class="level-2"><a class="final" href="/deploy/enginevariants/"><span>Deploying
Multiple Engine Variants</span></a></li><li class="level-2"><a class="final"
href="/deploy/plugin/"><span>Engine Server Plugin</span></a></li></ul></li><li
class="level-1"><a class="expandible" href="#"><span>Customizing an
Engine</span></a><ul><li class="level-2"><a class="final"
href="/customize/"><span>Learning DASE</span></a></li><li class="level-2"><a
class="final" href="/customize/dase/"><span>Implement DASE</span></a></li><li
class="level-2"><a class="final"
href="/customize/troubleshooting/"><span>Troubleshooting Engine
Development</span></a></li><li class="level-2"><a class="final"
href="/api/current/#package"><span>Engine Scala
APIs</span></a></li></ul></li><li class="level-1"><a
class="expandible" href="#"><span>Collecting and Analyzing Data</span></a><ul><li
class="level-2"><a class="final" href="/datacollection/"><span>Event Server
Overview</span></a></li><li class="level-2"><a class="final"
href="/datacollection/eventapi/"><span>Collecting Data with
REST/SDKs</span></a></li><li class="level-2"><a class="final"
href="/datacollection/eventmodel/"><span>Events Modeling</span></a></li><li
class="level-2"><a class="final"
href="/datacollection/webhooks/"><span>Unifying Multichannel Data with
Webhooks</span></a></li><li class="level-2"><a class="final"
href="/datacollection/channel/"><span>Channel</span></a></li><li
class="level-2"><a class="final"
href="/datacollection/batchimport/"><span>Importing Data in
Batch</span></a></li><li class="level-2"><a class="final"
href="/datacollection/analytics/"><span>Using Analytics
Tools</span></a></li><li class="level-2"><a class="final"
href="/datacollection/plugin/"><span>Event Server
Plugin</span></a></li></ul></li><li class="level-1
"><a class="expandible" href="#"><span>Choosing an
Algorithm(s)</span></a><ul><li class="level-2"><a class="final"
href="/algorithm/"><span>Built-in Algorithm Libraries</span></a></li><li
class="level-2"><a class="final" href="/algorithm/switch/"><span>Switching to
Another Algorithm</span></a></li><li class="level-2"><a class="final"
href="/algorithm/multiple/"><span>Combining Multiple
Algorithms</span></a></li><li class="level-2"><a class="final"
href="/algorithm/custom/"><span>Adding Your Own
Algorithms</span></a></li></ul></li><li class="level-1"><a class="expandible"
href="#"><span>ML Tuning and Evaluation</span></a><ul><li class="level-2"><a
class="final" href="/evaluation/"><span>Overview</span></a></li><li
class="level-2"><a class="final"
href="/evaluation/paramtuning/"><span>Hyperparameter Tuning</span></a></li><li
class="level-2"><a class="final"
href="/evaluation/evaluationdashboard/"><span>Evaluation
Dashboard</span></a></li><li class="level-2"><a class="final"
href="/evaluation/metricchoose/"><span>Choosing Evaluation Metrics</span></a></li><li
class="level-2"><a class="final" href="/evaluation/metricbuild/"><span>Building
Evaluation Metrics</span></a></li></ul></li><li class="level-1"><a
class="expandible" href="#"><span>System Architecture</span></a><ul><li
class="level-2"><a class="final" href="/system/"><span>Architecture
Overview</span></a></li><li class="level-2"><a class="final"
href="/system/anotherdatastore/"><span>Using Another Data
Store</span></a></li></ul></li><li class="level-1"><a class="expandible"
href="#"><span>PredictionIO® Official Templates</span></a><ul><li
class="level-2"><a class="final"
href="/templates/"><span>Intro</span></a></li><li class="level-2"><a
class="expandible" href="#"><span>Recommendation</span></a><ul><li
class="level-3"><a class="final"
href="/templates/recommendation/quickstart/"><span>Quick
Start</span></a></li><li class="level-3"><a class="final"
href="/templates/recommendation/dase/"><span>DASE</span></a></li><li
class="level-3"><a class="final"
href="/templates/recommendation/evaluation/"><span>Evaluation
Explained</span></a></li><li class="level-3"><a class="final"
href="/templates/recommendation/how-to/"><span>How-To</span></a></li><li
class="level-3"><a class="final"
href="/templates/recommendation/reading-custom-events/"><span>Read Custom
Events</span></a></li><li class="level-3"><a class="final"
href="/templates/recommendation/customize-data-prep/"><span>Customize Data
Preparator</span></a></li><li class="level-3"><a class="final"
href="/templates/recommendation/customize-serving/"><span>Customize
Serving</span></a></li><li class="level-3"><a class="final"
href="/templates/recommendation/training-with-implicit-preference/"><span>Train
with Implicit Preference</span></a></li><li class="level-3"><a class="final"
href="/templates/recommendation/blacklist-items/"><span>Filter Recommended
Items by Blacklist in Query</span></a></li><li class="level-3"><a class="final"
href="/templates/recommendation/batch-evaluator/"><span>Batch Persistable
Evaluator</span></a></li></ul></li><li class="level-2"><a class="expandible"
href="#"><span>E-Commerce Recommendation</span></a><ul><li class="level-3"><a
class="final" href="/templates/ecommercerecommendation/quickstart/"><span>Quick
Start</span></a></li><li class="level-3"><a class="final"
href="/templates/ecommercerecommendation/dase/"><span>DASE</span></a></li><li
class="level-3"><a class="final"
href="/templates/ecommercerecommendation/how-to/"><span>How-To</span></a></li><li
class="level-3"><a class="final"
href="/templates/ecommercerecommendation/train-with-rate-event/"><span>Train
with Rate Event</span></a></li><li class="level-3"><a class="final"
href="/templates/ecommercerecommendation/adjust-score/"><span>Adjust
Score</span></a></li></ul></li><li class="level-2"><a class="expandible"
href="#"><span>Similar Product</span></a><ul><li class="level-3"><a
class="final" href="/templates/similarproduct/quickstart/"><span>Quick
Start</span></a></li><li class="level-3"><a class="final"
href="/templates/similarproduct/dase/"><span>DASE</span></a></li><li
class="level-3"><a class="final"
href="/templates/similarproduct/how-to/"><span>How-To</span></a></li><li
class="level-3"><a class="final"
href="/templates/similarproduct/multi-events-multi-algos/"><span>Multiple
Events and Multiple Algorithms</span></a></li><li class="level-3"><a
class="final"
href="/templates/similarproduct/return-item-properties/"><span>Returns Item
Properties</span></a></li><li class="level-3"><a class="final"
href="/templates/similarproduct/train-with-rate-event/"><span>Train with Rate
Event</span></a></li><li class="level-3"><a class="final"
href="/templates/similarproduct/rid-user-set-event/"><span>Get Rid of Events
for Users</span></a></li><li class="level-3"><a class="final"
href="/templates/similarproduct/recommended-user/"><span>Recommend
Users</span></a></li></ul></li><li class="level-2"><a class="expandible"
href="#"><span>Classification</span></a><ul><li class="level-3"><a class="final"
href="/templates/classification/quickstart/"><span>Quick
Start</span></a></li><li class="level-3"><a class="final"
href="/templates/classification/dase/"><span>DASE</span></a></li><li
class="level-3"><a class="final"
href="/templates/classification/how-to/"><span>How-To</span></a></li><li
class="level-3"><a class="final"
href="/templates/classification/add-algorithm/"><span>Use Alternative
Algorithm</span></a></li><li class="level-3"><a class="final"
href="/templates/classification/reading-custom-properties/"><span>Read Custom
Properties</span></a></li></ul></li></ul></li><li class="level-1"><a
class="expandible" href="#"><span>Engine Template Gallery</span></a><ul><li
class="level-2"><a class="final"
href="/gallery/template-gallery/"><span>Browse</span></a></li><li
class="level-2"><a class="final"
href="/community/submit-template/"><span>Submit your Engine as a
Template</span></a></li></ul></li><li
class="level-1"><a class="expandible" href="#"><span>Demo Tutorials</span></a><ul><li
class="level-2"><a class="final" href="/demo/tapster/"><span>Comics
Recommendation Demo</span></a></li><li class="level-2"><a class="final"
href="/demo/community/"><span>Community Contributed Demo</span></a></li><li
class="level-2"><a class="final active"
href="/demo/textclassification/"><span>Text Classification Engine
Tutorial</span></a></li></ul></li><li class="level-1"><a class="expandible"
href="/community/"><span>Getting Involved</span></a><ul><li class="level-2"><a
class="final" href="/community/contribute-code/"><span>Contribute
Code</span></a></li><li class="level-2"><a class="final"
href="/community/contribute-documentation/"><span>Contribute
Documentation</span></a></li><li class="level-2"><a class="final"
href="/community/contribute-sdk/"><span>Contribute a SDK</span></a></li><li
class="level-2"><a class="final"
href="/community/contribute-webhook/"><span>Contribute a
Webhook</span></a></li><li
class="level-2"><a class="final" href="/community/projects/"><span>Community
Projects</span></a></li></ul></li><li class="level-1"><a class="expandible"
href="#"><span>Getting Help</span></a><ul><li class="level-2"><a class="final"
href="/resources/faq/"><span>FAQs</span></a></li><li class="level-2"><a
class="final" href="/support/"><span>Support</span></a></li></ul></li><li
class="level-1"><a class="expandible"
href="#"><span>Resources</span></a><ul><li class="level-2"><a class="final"
href="/cli/"><span>Command-line Interface</span></a></li><li class="level-2"><a
class="final" href="/resources/release/"><span>Release
Cadence</span></a></li><li class="level-2"><a class="final"
href="/resources/intellij/"><span>Developing Engines with IntelliJ
IDEA</span></a></li><li class="level-2"><a class="final"
href="/resources/upgrade/"><span>Upgrade Instructions</span></a></li><li
class="level-2"><a class="final"
href="/resources/glossary/"><span>Glossary</span></a></li></ul></li><li
class="level-1"><a class="expandible" href="#"><span>Apache Software
Foundation</span></a><ul><li class="level-2"><a class="final"
href="https://www.apache.org/"><span>Apache Homepage</span></a></li><li
class="level-2"><a class="final"
href="https://www.apache.org/licenses/"><span>License</span></a></li><li
class="level-2"><a class="final"
href="https://www.apache.org/foundation/sponsorship.html"><span>Sponsorship</span></a></li><li
class="level-2"><a class="final"
href="https://www.apache.org/foundation/thanks.html"><span>Thanks</span></a></li><li
class="level-2"><a class="final"
href="https://www.apache.org/security/"><span>Security</span></a></li></ul></li></ul></nav></div><div
class="col-md-9 col-sm-12"><div class="content-header hidden-md
hidden-lg"><div id="breadcrumbs" class="hidden-sm hidden xs"><ul><li><a
href="#">Demo Tutorials</a><span class="spacer">></span></li><li><span
class="last">Text Classification Engine Tutorial</span></li></ul></div><div
id="page-title"><h1>Text Classification
Engine Tutorial</h1></div></div><div
id="table-of-content-wrapper"><h5>On this page</h5><aside
id="table-of-contents"><ul> <li> <a href="#introduction">Introduction</a> </li>
<li> <a href="#prerequisites">Prerequisites</a> </li> <li> <a
href="#engine-overview">Engine Overview</a> </li> <li> <a
href="#quick-start">Quick Start</a> </li> </ul> </li> <li> <a
href="#detailed-explanation-of-dase">Detailed Explanation of DASE</a> <ul> <li>
<a href="#importing-data">Importing Data</a> </li> <li> <a
href="#data-source-reading-event-data">Data Source: Reading Event Data</a>
</li> <li> <a href="#preparator-data-processing-with-dase">Preparator : Data
Processing With DASE</a> </li> <li> <a href="#algorithm-component">Algorithm
Component</a> </li> <li> <a
href="#serving-delivering-the-final-prediction">Serving: Delivering the Final
Prediction</a> </li> <li> <a
href="#evaluation-model-assessment-and-selection">Evaluation: Model Assessment
and Selection</a> </li> <li> <a
href="#engine-deployment">Engine Deployment</a> </li> </ul> </aside><hr/><a id="edit-page-link"
href="https://github.com/apache/incubator-predictionio/tree/livedoc/docs/manual/source/demo/textclassification.html.md.erb"><img
src="/images/icons/edit-pencil-d6c1bb3d.png"/>Edit this page</a></div><div
class="content-header hidden-sm hidden-xs"><div id="breadcrumbs"
class="hidden-sm hidden xs"><ul><li><a href="#">Demo Tutorials</a><span
class="spacer">></span></li><li><span class="last">Text Classification
Engine Tutorial</span></li></ul></div><div id="page-title"><h1>Text
Classification Engine Tutorial</h1></div></div><div class="content">
<p>(Updated for Text Classification Template version 3.1)</p><h2
id='introduction' class='header-anchors'>Introduction</h2><p>In the real world,
there are many applications that collect text as data. For example, spam
detectors take email and header content to automatically determine what is or
is not spam; applications can gauge the general sentiment in a geographical
area by analyzing Twitter data; and news articles can be automatically
categorized based solely on the text content. There is a wide array of machine
learning models you can use to create, or train, a predictive model to assign
an incoming article, or query, to an existing category. Before you can use
these techniques you must first transform the text data (in this case the set
of news articles) into numeric vectors, or feature vectors, that can be used to
train your model.</p><p>The purpose of this tutorial is to illustrate how you
can go about doing this using PredictionIO's platform. The advantages of
using this platform include: a dynamic engine that responds to queries in
real-time; <a
href="http://en.wikipedia.org/wiki/Separation_of_concerns">separation of
concerns</a>, which offers code re-use and maintainability; and distributed
computing capabilities for scalability and efficiency. Moreover, it is easy to
incorporate non-trivial data modeling tasks into the DASE architecture,
allowing Data Scientists to focus on tasks related to modeling. This
tutorial will exemplify some of these ideas by guiding you through
PredictionIO's <a
href="/gallery/template-gallery/#natural-language-processing">text
classification template</a>.</p><h2 id='prerequisites'
class='header-anchors'>Prerequisites</h2><p>Before getting started, please make
sure that you have the latest version of Apache PredictionIO <a
href="http://predictionio.apache.org/install/">installed</a>. We emphasize here
that this is an engine template written in <strong>Scala</strong> and can be
more generally thought of as an SBT project containing all the necessary
components.</p><p>You should also download the engine template named Text
Classification Engine that accompanies this tutorial by cloning the template
repository:</p><div class="highlight shell"><table style="border-spacing:
0"><tbody><tr><td class="gutter gl" style="text-align: right"><pre
class="lineno">1</pre></td><td
class="code"><pre>git clone
https://github.com/apache/incubator-predictionio-template-text-classifier.git
< Your new engine directory >
+<!DOCTYPE html><html><head><title>Text Classification Engine
Tutorial</title><meta charset="utf-8"/><meta content="IE=edge,chrome=1"
http-equiv="X-UA-Compatible"/><meta name="viewport"
content="width=device-width, initial-scale=1.0"/><meta class="swiftype"
name="title" data-type="string" content="Text Classification Engine
Tutorial"/><link rel="canonical"
href="https://predictionio.apache.org/demo/textclassification/"/><link
href="/images/favicon/normal-b330020a.png" rel="shortcut icon"/><link
href="/images/favicon/apple-c0febcf2.png" rel="apple-touch-icon"/><link
href="//fonts.googleapis.com/css?family=Open+Sans:300italic,400italic,600italic,700italic,800italic,400,300,600,700,800"
rel="stylesheet"/><link
href="//maxcdn.bootstrapcdn.com/font-awesome/4.2.0/css/font-awesome.min.css"
rel="stylesheet"/><link href="/stylesheets/application-eccfc6cb.css"
rel="stylesheet" type="text/css"/><script
src="//cdnjs.cloudflare.com/ajax/libs/html5shiv/3.7.2/html5shiv.min.js"></script><script
src=
"//cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML"></script><script
src="//use.typekit.net/pqo0itb.js"></script><script>try{Typekit.load({ async:
true });}catch(e){}</script></head><body><div id="global"><header><div
class="container" id="header-wrapper"><div class="row"><div
class="col-sm-12"><div id="logo-wrapper"><span id="drawer-toggle"></span><a
href="#"></a><a href="http://predictionio.apache.org/"><img alt="Apache
PredictionIO" id="logo"
src="/images/logos/logo-ee2b9bb3.png"/></a><span>®</span></div><div
id="menu-wrapper"><div id="pill-wrapper"><a class="pill left"
href="/gallery/template-gallery">TEMPLATES</a> <a class="pill right"
href="//github.com/apache/predictionio/">OPEN SOURCE</a></div></div><img
class="mobile-search-bar-toggler hidden-md hidden-lg"
src="/images/icons/search-glass-704bd4ff.png"/></div></div></div></header><div
id="search-bar-row-wrapper"><div class="container-fluid"
id="search-bar-row"><div class="row"><div class="col-md-9
col-sm-11 col-xs-11"><div class="hidden-md hidden-lg"
id="mobile-page-heading-wrapper"><p>PredictionIO Docs</p><h4>Text
Classification Engine Tutorial</h4></div><h4 class="hidden-sm
hidden-xs">PredictionIO Docs</h4></div><div class="col-md-3 col-sm-1 col-xs-1
hidden-md hidden-lg"><img id="left-menu-indicator"
src="/images/icons/down-arrow-dfe9f7fe.png"/></div><div class="col-md-3
col-sm-12 col-xs-12 swiftype-wrapper"><div class="swiftype"><form
class="search-form"><img class="search-box-toggler hidden-xs hidden-sm"
src="/images/icons/search-glass-704bd4ff.png"/><div class="search-box"><img
src="/images/icons/search-glass-704bd4ff.png"/><input type="text"
id="st-search-input" class="st-search-input" placeholder="Search
Doc..."/></div><img class="swiftype-row-hider hidden-md hidden-lg"
src="/images/icons/drawer-toggle-active-fcbef12a.png"/></form></div></div><div
class="mobile-left-menu-toggler hidden-md
hidden-lg"></div></div></div></div><div id="page" class="container-fluid"><div
class
="row"><div id="left-menu-wrapper" class="col-md-3"><nav id="nav-main"><ul><li
class="level-1"><a class="expandible" href="/"><span>Apache PredictionIO®
Documentation</span></a><ul><li class="level-2"><a class="final"
href="/"><span>Welcome to Apache PredictionIO®</span></a></li></ul></li><li
class="level-1"><a class="expandible" href="#"><span>Getting
Started</span></a><ul><li class="level-2"><a class="final"
href="/start/"><span>A Quick Intro</span></a></li><li class="level-2"><a
class="final" href="/install/"><span>Installing Apache
PredictionIO</span></a></li><li class="level-2"><a class="final"
href="/start/download/"><span>Downloading an Engine Template</span></a></li><li
class="level-2"><a class="final" href="/start/deploy/"><span>Deploying Your
First Engine</span></a></li><li class="level-2"><a class="final"
href="/start/customize/"><span>Customizing the
Engine</span></a></li></ul></li><li class="level-1"><a class="expandible"
href="#"><span>Integrating with Your App</span
></a><ul><li class="level-2"><a class="final"
href="/appintegration/"><span>App Integration Overview</span></a></li><li
class="level-2"><a class="expandible" href="/sdk/"><span>List of
SDKs</span></a><ul><li class="level-3"><a class="final"
href="/sdk/java/"><span>Java & Android SDK</span></a></li><li
class="level-3"><a class="final" href="/sdk/php/"><span>PHP
SDK</span></a></li><li class="level-3"><a class="final"
href="/sdk/python/"><span>Python SDK</span></a></li><li class="level-3"><a
class="final" href="/sdk/ruby/"><span>Ruby SDK</span></a></li><li
class="level-3"><a class="final" href="/sdk/community/"><span>Community
Powered SDKs</span></a></li></ul></li></ul></li><li class="level-1"><a
class="expandible" href="#"><span>Deploying an Engine</span></a><ul><li
class="level-2"><a class="final" href="/deploy/"><span>Deploying as a Web
Service</span></a></li><li class="level-2"><a class="final"
href="/batchpredict/"><span>Batch Predictions</span></a></li><li
class="level-2"><a
class="final" href="/deploy/monitoring/"><span>Monitoring
Engine</span></a></li><li class="level-2"><a class="final"
href="/deploy/engineparams/"><span>Setting Engine Parameters</span></a></li><li
class="level-2"><a class="final" href="/deploy/enginevariants/"><span>Deploying
Multiple Engine Variants</span></a></li><li class="level-2"><a class="final"
href="/deploy/plugin/"><span>Engine Server Plugin</span></a></li></ul></li><li
class="level-1"><a class="expandible" href="#"><span>Customizing an
Engine</span></a><ul><li class="level-2"><a class="final"
href="/customize/"><span>Learning DASE</span></a></li><li class="level-2"><a
class="final" href="/customize/dase/"><span>Implement DASE</span></a></li><li
class="level-2"><a class="final"
href="/customize/troubleshooting/"><span>Troubleshooting Engine
Development</span></a></li><li class="level-2"><a class="final"
href="/api/current/#package"><span>Engine Scala
APIs</span></a></li></ul></li><li class="level-1"><a class="expandible"
href="#"><span>Collecting and Analyzing Data</span></a><ul><li class="level-2"><a
class="final" href="/datacollection/"><span>Event Server
Overview</span></a></li><li class="level-2"><a class="final"
href="/datacollection/eventapi/"><span>Collecting Data with
REST/SDKs</span></a></li><li class="level-2"><a class="final"
href="/datacollection/eventmodel/"><span>Events Modeling</span></a></li><li
class="level-2"><a class="final"
href="/datacollection/webhooks/"><span>Unifying Multichannel Data with
Webhooks</span></a></li><li class="level-2"><a class="final"
href="/datacollection/channel/"><span>Channel</span></a></li><li
class="level-2"><a class="final"
href="/datacollection/batchimport/"><span>Importing Data in
Batch</span></a></li><li class="level-2"><a class="final"
href="/datacollection/analytics/"><span>Using Analytics
Tools</span></a></li><li class="level-2"><a class="final"
href="/datacollection/plugin/"><span>Event Server
Plugin</span></a></li></ul></li><li class="level-1"><a class
="expandible" href="#"><span>Choosing an Algorithm(s)</span></a><ul><li
class="level-2"><a class="final" href="/algorithm/"><span>Built-in Algorithm
Libraries</span></a></li><li class="level-2"><a class="final"
href="/algorithm/switch/"><span>Switching to Another
Algorithm</span></a></li><li class="level-2"><a class="final"
href="/algorithm/multiple/"><span>Combining Multiple
Algorithms</span></a></li><li class="level-2"><a class="final"
href="/algorithm/custom/"><span>Adding Your Own
Algorithms</span></a></li></ul></li><li class="level-1"><a class="expandible"
href="#"><span>ML Tuning and Evaluation</span></a><ul><li class="level-2"><a
class="final" href="/evaluation/"><span>Overview</span></a></li><li
class="level-2"><a class="final"
href="/evaluation/paramtuning/"><span>Hyperparameter Tuning</span></a></li><li
class="level-2"><a class="final"
href="/evaluation/evaluationdashboard/"><span>Evaluation
Dashboard</span></a></li><li class="level-2"><a class="final"
href="/evaluation/metricchoose/"><span>Choosing Evaluation Metrics</span></a></li><li
class="level-2"><a class="final" href="/evaluation/metricbuild/"><span>Building
Evaluation Metrics</span></a></li></ul></li><li class="level-1"><a
class="expandible" href="#"><span>System Architecture</span></a><ul><li
class="level-2"><a class="final" href="/system/"><span>Architecture
Overview</span></a></li><li class="level-2"><a class="final"
href="/system/anotherdatastore/"><span>Using Another Data
Store</span></a></li></ul></li><li class="level-1"><a class="expandible"
href="#"><span>PredictionIO® Official Templates</span></a><ul><li
class="level-2"><a class="final"
href="/templates/"><span>Intro</span></a></li><li class="level-2"><a
class="expandible" href="#"><span>Recommendation</span></a><ul><li
class="level-3"><a class="final"
href="/templates/recommendation/quickstart/"><span>Quick
Start</span></a></li><li class="level-3"><a class="final"
href="/templates/recommendation/dase/"><span>DASE</span></a></li><li
class="level-3"><a class="final"
href="/templates/recommendation/evaluation/"><span>Evaluation
Explained</span></a></li><li class="level-3"><a class="final"
href="/templates/recommendation/how-to/"><span>How-To</span></a></li><li
class="level-3"><a class="final"
href="/templates/recommendation/reading-custom-events/"><span>Read Custom
Events</span></a></li><li class="level-3"><a class="final"
href="/templates/recommendation/customize-data-prep/"><span>Customize Data
Preparator</span></a></li><li class="level-3"><a class="final"
href="/templates/recommendation/customize-serving/"><span>Customize
Serving</span></a></li><li class="level-3"><a class="final"
href="/templates/recommendation/training-with-implicit-preference/"><span>Train
with Implicit Preference</span></a></li><li class="level-3"><a class="final"
href="/templates/recommendation/blacklist-items/"><span>Filter Recommended
Items by Blacklist in Query</span></a></li><li class="level-3"><a class="final"
href="/templates/recommendation/batch-evaluator/"><span>Batch Persistable
Evaluator</span></a></li></ul></li><li class="level-2"><a class="expandible"
href="#"><span>E-Commerce Recommendation</span></a><ul><li class="level-3"><a
class="final" href="/templates/ecommercerecommendation/quickstart/"><span>Quick
Start</span></a></li><li class="level-3"><a class="final"
href="/templates/ecommercerecommendation/dase/"><span>DASE</span></a></li><li
class="level-3"><a class="final"
href="/templates/ecommercerecommendation/how-to/"><span>How-To</span></a></li><li
class="level-3"><a class="final"
href="/templates/ecommercerecommendation/train-with-rate-event/"><span>Train
with Rate Event</span></a></li><li class="level-3"><a class="final"
href="/templates/ecommercerecommendation/adjust-score/"><span>Adjust
Score</span></a></li></ul></li><li class="level-2"><a class="expandible"
href="#"><span>Similar Product</span></a><ul><li class="level-3"><a
class="final" href="/templates/similarproduct/quickstart/"><span>Quick
Start</span></a></li><li class="level-3"><a class="final"
href="/templates/similarproduct/dase/"><span>DASE</span></a></li><li
class="level-3"><a class="final"
href="/templates/similarproduct/how-to/"><span>How-To</span></a></li><li
class="level-3"><a class="final"
href="/templates/similarproduct/multi-events-multi-algos/"><span>Multiple
Events and Multiple Algorithms</span></a></li><li class="level-3"><a
class="final"
href="/templates/similarproduct/return-item-properties/"><span>Returns Item
Properties</span></a></li><li class="level-3"><a class="final"
href="/templates/similarproduct/train-with-rate-event/"><span>Train with Rate
Event</span></a></li><li class="level-3"><a class="final"
href="/templates/similarproduct/rid-user-set-event/"><span>Get Rid of Events
for Users</span></a></li><li class="level-3"><a class="final"
href="/templates/similarproduct/recommended-user/"><span>Recommend
Users</span></a></li></ul></li><li class="level-2"><a class="expandible"
href="#"><span>Classification</span></a><ul><li class="level-3"><a class="final"
href="/templates/classification/quickstart/"><span>Quick
Start</span></a></li><li class="level-3"><a class="final"
href="/templates/classification/dase/"><span>DASE</span></a></li><li
class="level-3"><a class="final"
href="/templates/classification/how-to/"><span>How-To</span></a></li><li
class="level-3"><a class="final"
href="/templates/classification/add-algorithm/"><span>Use Alternative
Algorithm</span></a></li><li class="level-3"><a class="final"
href="/templates/classification/reading-custom-properties/"><span>Read Custom
Properties</span></a></li></ul></li></ul></li><li class="level-1"><a
class="expandible" href="#"><span>Engine Template Gallery</span></a><ul><li
class="level-2"><a class="final"
href="/gallery/template-gallery/"><span>Browse</span></a></li><li
class="level-2"><a class="final"
href="/community/submit-template/"><span>Submit your Engine as a
Template</span></a></li></ul></li><li class="level-1"><a
class="expandible" href="#"><span>Demo Tutorials</span></a><ul><li
class="level-2"><a class="final" href="/demo/tapster/"><span>Comics
Recommendation Demo</span></a></li><li class="level-2"><a class="final"
href="/demo/community/"><span>Community Contributed Demo</span></a></li><li
class="level-2"><a class="final active"
href="/demo/textclassification/"><span>Text Classification Engine
Tutorial</span></a></li></ul></li><li class="level-1"><a class="expandible"
href="/community/"><span>Getting Involved</span></a><ul><li class="level-2"><a
class="final" href="/community/contribute-code/"><span>Contribute
Code</span></a></li><li class="level-2"><a class="final"
href="/community/contribute-documentation/"><span>Contribute
Documentation</span></a></li><li class="level-2"><a class="final"
href="/community/contribute-sdk/"><span>Contribute a SDK</span></a></li><li
class="level-2"><a class="final"
href="/community/contribute-webhook/"><span>Contribute a
Webhook</span></a></li><li class="level-2
"><a class="final" href="/community/projects/"><span>Community
Projects</span></a></li></ul></li><li class="level-1"><a class="expandible"
href="#"><span>Getting Help</span></a><ul><li class="level-2"><a class="final"
href="/resources/faq/"><span>FAQs</span></a></li><li class="level-2"><a
class="final" href="/support/"><span>Support</span></a></li></ul></li><li
class="level-1"><a class="expandible"
href="#"><span>Resources</span></a><ul><li class="level-2"><a class="final"
href="/cli/"><span>Command-line Interface</span></a></li><li class="level-2"><a
class="final" href="/resources/release/"><span>Release
Cadence</span></a></li><li class="level-2"><a class="final"
href="/resources/intellij/"><span>Developing Engines with IntelliJ
IDEA</span></a></li><li class="level-2"><a class="final"
href="/resources/upgrade/"><span>Upgrade Instructions</span></a></li><li
class="level-2"><a class="final"
href="/resources/glossary/"><span>Glossary</span></a></li></ul></li><li
class="level-1"><a
class="expandible" href="#"><span>Apache Software Foundation</span></a><ul><li
class="level-2"><a class="final" href="https://www.apache.org/"><span>Apache
Homepage</span></a></li><li class="level-2"><a class="final"
href="https://www.apache.org/licenses/"><span>License</span></a></li><li
class="level-2"><a class="final"
href="https://www.apache.org/foundation/sponsorship.html"><span>Sponsorship</span></a></li><li
class="level-2"><a class="final"
href="https://www.apache.org/foundation/thanks.html"><span>Thanks</span></a></li><li
class="level-2"><a class="final"
href="https://www.apache.org/security/"><span>Security</span></a></li></ul></li></ul></nav></div><div
class="col-md-9 col-sm-12"><div class="content-header hidden-md
hidden-lg"><div id="breadcrumbs" class="hidden-sm hidden xs"><ul><li><a
href="#">Demo Tutorials</a><span class="spacer">></span></li><li><span
class="last">Text Classification Engine Tutorial</span></li></ul></div><div
id="page-title"><h1>Text Classification Engine
Tutorial</h1></div></div><div id="table-of-content-wrapper"><h5>On this
page</h5><aside id="table-of-contents"><ul> <li> <a
href="#introduction">Introduction</a> </li> <li> <a
href="#prerequisites">Prerequisites</a> </li> <li> <a
href="#engine-overview">Engine Overview</a> </li> <li> <a
href="#quick-start">Quick Start</a> </li> </ul> </li> <li> <a
href="#detailed-explanation-of-dase">Detailed Explanation of DASE</a> <ul> <li>
<a href="#importing-data">Importing Data</a> </li> <li> <a
href="#data-source-reading-event-data">Data Source: Reading Event Data</a>
</li> <li> <a href="#preparator-data-processing-with-dase">Preparator : Data
Processing With DASE</a> </li> <li> <a href="#algorithm-component">Algorithm
Component</a> </li> <li> <a
href="#serving-delivering-the-final-prediction">Serving: Delivering the Final
Prediction</a> </li> <li> <a
href="#evaluation-model-assessment-and-selection">Evaluation: Model Assessment
and Selection</a> </li> <li> <a href="#engine-deployment">Engine
Deployment</a> </li> </ul> </aside><hr/><a id="edit-page-link"
href="https://github.com/apache/predictionio/tree/livedoc/docs/manual/source/demo/textclassification.html.md.erb"><img
src="/images/icons/edit-pencil-d6c1bb3d.png"/>Edit this page</a></div><div
class="content-header hidden-sm hidden-xs"><div id="breadcrumbs"
class="hidden-sm hidden xs"><ul><li><a href="#">Demo Tutorials</a><span
class="spacer">></span></li><li><span class="last">Text Classification
Engine Tutorial</span></li></ul></div><div id="page-title"><h1>Text
Classification Engine Tutorial</h1></div></div><div class="content">
<p>(Updated for Text Classification Template version 3.1)</p><h2
id='introduction' class='header-anchors'>Introduction</h2><p>In the real world,
there are many applications that collect text as data. For example, spam
detectors take email and header content to automatically determine what is or
is not spam; applications can gauge the general sentiment in a geographical
area by analyzing Twitter data; and news articles can be automatically
categorized based solely on the text content. There is a wide array of machine
learning models
you can use to create, or train, a predictive model to assign an incoming
article, or query, to an existing category. Before you can use these techniques
you must first transform the text data (in this case the set of news articles)
into numeric vectors, or feature vectors, that can be used to train your
model.</p><p>The purpose of this tutorial is to illustrate how you can go about
doing this using PredictionIO's platform. The advantages of using this
platform include: a dynamic engine that responds to queries in real-time; <a
href="http://en.wikipedia.org/wiki/Separation_of_concerns">separation of
concerns</a>, which offers code re-use and maintainability, and distributed
computing capabilities for scalability and efficiency. Moreover, it is easy to
incorporate non-trivial data modeling tasks into the DASE architecture allowing
Data Scientists to focus on tasks related to modeling. This tutorial will
exemplify some of these ideas by guiding you through PredictionIO's <a
href="/gallery/template-gallery/#natural-language-processing">text
classification template</a>.</p><h2 id='prerequisites'
class='header-anchors'>Prerequisites</h2><p>Before getting started, please make
sure that you have the latest version of Apache PredictionIO <a
href="http://predictionio.apache.org/install/">installed</a>. We emphasize here
that this is an engine template written in <strong>Scala</strong> and can be
more generally thought of as an SBT project containing all the necessary
components.</p><p>You should also download the engine template named Text
Classification Engine that accompanies this tutorial by cloning the template
repository:</p><div class="highlight shell"><table style="border-spacing:
0"><tbody><tr><td class="gutter gl" style="text-align: right"><pre
class="lineno">1</pre></td><td class="code"><pre>git clone https://github.com/apache/predictionio-template-text-classifier.git &lt; Your new engine directory &gt;
</pre></td></tr></tbody></table> </div> <h2 id='engine-overview'
class='header-anchors'>Engine Overview</h2><p>The engine follows the DASE
architecture which we briefly review here. As a user, you are tasked with
collecting data for your web or application, and importing it into
PredictionIO's Event Server. Once the data is in the server, it can be read
and processed by the engine via the Data Source and Preparation components,
respectively. The Algorithm component uses the processed, or prepared, data to
train a set of predictive models. Once you have trained these models, you are
ready to deploy your engine and respond to real-time queries via the Serving
component which combines the results from different fitted models. The
Evaluation component is used to compute an appropriate metric to test the
performance of a fitted model, as well as aid in the tuning of model hyper
parameters.</p><p>This engine template is meant to handle text classification
which means you will be working with text data. This means that a query, or
newly observed document, will
be of the form:</p><p><code>{text : String}</code>.</p><p>In the running
example, a query would be an incoming news article. Once the engine is deployed
it can process the query, and then return a Predicted Result of the
form</p><p><code>{category : String, confidence : Double}</code>.</p><p>Here
category is the model's class assignment for this new text document (i.e.
the best guess for this article's categorization), and confidence, a value
between 0 and 1 representing your confidence in the category prediction (0
meaning you have no confidence in the prediction). The Actual Result is of the
form</p><p><code>{category : String}</code>.</p><p>This is used in the
evaluation stage when estimating the performance of your predictive model (how
well does the model predict categories). Please refer to the <a
href="https://predictionio.apache.org/customize/">following tutorial</a> for a
more detailed explanation of how your engine will interact with your web
application, as well
as an in-depth overview of DASE.</p><h2 id='quick-start'
class='header-anchors'>Quick Start</h2><p>This is a quick start guide in case
you want to start using the engine right away. Sample email data for spam
classification will be used. For more detailed information, read the subsequent
sections.</p><h3 id='1.-create-a-new-application.' class='header-anchors'>1.
Create a new application.</h3><p>After the application is created, you will be
given an access key and application ID for the application.</p><div
class="highlight shell"><table style="border-spacing: 0"><tbody><tr><td
class="gutter gl" style="text-align: right"><pre class="lineno">1</pre></td><td
class="code"><pre><span class="gp">$ </span>pio app new MyTextApp
</pre></td></tr></tbody></table> </div> <h3 id='2.-import-the-tutorial-data.'
class='header-anchors'>2. Import the tutorial data.</h3><p>There are three
different data sets available, each giving a different use case for this
engine. Please refer to the <strong>Data Source: Reading Event Data</strong>
section to see how to appropriately modify the <code>DataSource</code> class for
use with each respective data set. The default data set is an e-mail spam data
set.</p><p>These data sets have already been processed and are ready for <a
href="/datacollection/batchimport/">batch import</a>. Replace <code>***</code>
with your actual application ID.</p><div class="highlight shell"><table
style="border-spacing: 0"><tbody><tr><td class="gutter gl" style="text-align:
right"><pre class="lineno">1
2
@@ -33,7 +33,7 @@
26</pre></td><td class="code"><pre><span class="o">{</span>
<span class="s2">"id"</span>: <span class="s2">"default"</span>,
<span class="s2">"description"</span>: <span class="s2">"Default
settings"</span>,
- <span class="s2">"engineFactory"</span>: <span
class="s2">"org.template.textclassification.TextClassificationEngine"</span>,
+ <span class="s2">"engineFactory"</span>: <span
class="s2">"org.example.textclassification.TextClassificationEngine"</span>,
<span class="s2">"datasource"</span>: <span class="o">{</span>
<span class="s2">"params"</span>: <span class="o">{</span>
<span class="s2">"appName"</span>: <span class="s2">"MyTextApp"</span>
@@ -70,7 +70,7 @@
</pre></td></tr></tbody></table> </div> <p>you should see following outputs
returned by the engine:</p><div class="highlight shell"><table
style="border-spacing: 0"><tbody><tr><td class="gutter gl" style="text-align:
right"><pre class="lineno">1</pre></td><td class="code"><pre><span
class="o">{</span><span class="s2">"category"</span>:<span class="s2">"not
spam"</span>,<span class="s2">"confidence"</span>:0.852619510921587<span
class="o">}</span>
</pre></td></tr></tbody></table> </div> <p>Try another query:</p><div
class="highlight shell"><table style="border-spacing: 0"><tbody><tr><td
class="gutter gl" style="text-align: right"><pre class="lineno">1</pre></td><td
class="code"><pre><span class="gp">$ </span>curl -H <span
class="s2">"Content-Type: application/json"</span> -d <span class="s1">'{
"text":"Earn extra cash!" }'</span> http://localhost:8000/queries.json
</pre></td></tr></tbody></table> </div> <p>you should see following outputs
returned by the engine:</p><div class="highlight shell"><table
style="border-spacing: 0"><tbody><tr><td class="gutter gl" style="text-align:
right"><pre class="lineno">1</pre></td><td class="code"><pre><span
class="o">{</span><span class="s2">"category"</span>:<span
class="s2">"spam"</span>,<span
class="s2">"confidence"</span>:0.5268770133242983<span class="o">}</span>
-</pre></td></tr></tbody></table> </div> <h3
id='5.b.evaluate-your-training-model-and-tune-parameters.'
class='header-anchors'>5.b.Evaluate your training model and tune
parameters.</h3><div class="highlight shell"><table style="border-spacing:
0"><tbody><tr><td class="gutter gl" style="text-align: right"><pre
class="lineno">1</pre></td><td class="code"><pre><span class="gp">$ </span>pio
<span class="nb">eval </span>org.template.textclassification.AccuracyEvaluation
org.template.textclassification.EngineParamsList
+</pre></td></tr></tbody></table> </div> <h3
id='5.b.evaluate-your-training-model-and-tune-parameters.'
class='header-anchors'>5.b.Evaluate your training model and tune
parameters.</h3><div class="highlight shell"><table style="border-spacing:
0"><tbody><tr><td class="gutter gl" style="text-align: right"><pre
class="lineno">1</pre></td><td class="code"><pre><span class="gp">$ </span>pio
<span class="nb">eval </span>org.example.textclassification.AccuracyEvaluation
org.example.textclassification.EngineParamsList
</pre></td></tr></tbody></table> </div> <p><strong>Note:</strong> Training and
evaluation stages are generally different stages of engine development.
Evaluation is there to help you choose the best <a
href="/evaluation/paramtuning/">algorithm parameters</a> to use for training an
engine that is to be deployed as a web service.</p><p>Depending on your needs,
in steps (5.x.) above, you can configure your Spark settings by typing a
command of the form:</p><div class="highlight shell"><table
style="border-spacing: 0"><tbody><tr><td class="gutter gl" style="text-align:
right"><pre class="lineno">1</pre></td><td class="code"><pre><span class="gp">$
</span>pio <span class="nb">command </span>command_parameters -- --master url
--driver-memory <span class="o">{</span>0<span class="o">}</span>G
--executor-memory <span class="o">{</span>1<span class="o">}</span>G --conf
spark.akka.framesize<span class="o">={</span>2<span class="o">}</span>
--total_executor_cores <span class="o">{</span>3<span
class="o">}</span>
</pre></td></tr></tbody></table> </div> <p>Only the latter commands are listed
as these are some of the more commonly modified values. See the <a
href="https://spark.apache.org/docs/latest/spark-standalone.html">Spark
documentation</a> and the <a
href="http://predictionio.apache.org/resources/faq/">PredictionIO FAQ's</a>
for more information.</p><p><strong>Note:</strong> We recommend you set your
driver memory to <code>1G</code> or <code>2G</code> as the data size when
dealing with text can be very large.</p><h1 id='detailed-explanation-of-dase'
class='header-anchors'>Detailed Explanation of DASE</h1><h2 id='importing-data'
class='header-anchors'>Importing Data</h2><p>In the quick start, email spam
classification is used. This template can easily be modified for other types
of text classification.</p><p>If you want to import different sets of data, follow
the Quick Start instructions to import data from different files. Make sure
that the Data Source is modified accordingly to match
the <code>event</code>, <code>entityType</code>, and <code>properties</code>
fields set for the specific dataset. The following section explains this in
more detail.</p><h2 id='data-source:-reading-event-data'
class='header-anchors'>Data Source: Reading Event Data</h2><p>Now that the data
has been imported into PredictionIO's Event Server, it needs to be read
from storage to be used by the engine. This is precisely what the DataSource
engine component is for, which is implemented in the template script
<code>DataSource.scala</code>. The class <code>Observation</code> serves as a
wrapper for storing the information about a news document needed to train a
model. The attribute label refers to the label of the category a document
belongs to, and text, stores the actual document content as a string. The class
TrainingData is used to store an RDD of Observation objects along with the set
of stop words.</p><p>The class <code>DataSourceParams</code> is used to specify
the parameters needed to read and prepare the data for processing. This class
is initialized
with two parameters <code>appName</code> and <code>evalK</code>. The first
parameter specifies your application name (i.e. MyTextApp), which is needed so
that the DataSource component knows where to pull the event data from. The
second parameter is used for model evaluation and specifies the number of folds
to use in <a
href="http://en.wikipedia.org/wiki/Cross-validation_%28statistics%29">cross-validation</a>
when estimating a model performance metric.</p><p>The final and most important
ingredient is the DataSource class. This is initialized with its corresponding
parameter class, and extends <code>PDataSource</code>. This
<strong>must</strong> implement the method <code>readTraining</code> which
returns an instance of type TrainingData. This method completely relies on the
defined private methods readEventData and readStopWords. Both of these
functions read data observations as Event instances, create an RDD
containing these events and finally transform the RDD of events into an
object of the appropriate type as seen below:</p><div class="highlight
scala"><table style="border-spacing: 0"><tbody><tr><td class="gutter gl"
style="text-align: right"><pre class="lineno">1
2
@@ -493,7 +493,7 @@
23</pre></td><td class="code"><pre><span class="w"> </span><span
class="p">{</span><span class="w">
</span><span class="s2">"id"</span><span class="p">:</span><span class="w">
</span><span class="s2">"default"</span><span class="p">,</span><span class="w">
</span><span class="s2">"description"</span><span class="p">:</span><span
class="w"> </span><span class="s2">"Default settings"</span><span
class="p">,</span><span class="w">
- </span><span class="s2">"engineFactory"</span><span class="p">:</span><span
class="w"> </span><span
class="s2">"org.template.textclassification.TextClassificationEngine"</span><span
class="p">,</span><span class="w">
+ </span><span class="s2">"engineFactory"</span><span class="p">:</span><span
class="w"> </span><span
class="s2">"org.example.textclassification.TextClassificationEngine"</span><span
class="p">,</span><span class="w">
</span><span class="s2">"datasource"</span><span class="p">:</span><span
class="w"> </span><span class="p">{</span><span class="w">
</span><span class="s2">"params"</span><span class="p">:</span><span
class="w"> </span><span class="p">{</span><span class="w">
</span><span class="s2">"appName"</span><span class="p">:</span><span
class="w"> </span><span class="s2">"MyTextApp"</span><span class="w">
@@ -543,11 +543,11 @@
</pre></td></tr></tbody></table> </div> <h2 id='engine-deployment'
class='header-anchors'>Engine Deployment</h2><p>Once an engine is ready for
deployment it can interact with your web application in real-time. This section
will cover how to send and receive queries from your engine, gather more data,
and re-train your model with the newly gathered data.</p><h3
id='sending-queries' class='header-anchors'>Sending Queries</h3><p>Recall that
one of the greatest advantages of using the PredictionIO platform is that once
your engine is deployed, you can respond to queries in real-time. Recall that
our queries are of the form</p><p><code>{"text" :
"..."}</code>.</p><p>To actually send a query you can use our REST
API by typing in the following shell command:</p><div class="highlight
shell"><table style="border-spacing: 0"><tbody><tr><td class="gutter gl"
style="text-align: right"><pre class="lineno">1</pre></td><td
class="code"><pre>curl -H <span class="s2">"Content-Type: application/json"</span> -d <span class="s1">'{ "text":"I like speed and
fast motorcycles." }'</span> http://localhost:8000/queries.json
</pre></td></tr></tbody></table> </div> <p>There are a number of <a
href="https://github.com/PredictionIO">SDK's</a> you can use to send your
queries and obtain a response. Recall that our predicted response is of the
form</p><div class="highlight shell"><table style="border-spacing:
0"><tbody><tr><td class="gutter gl" style="text-align: right"><pre
class="lineno">1</pre></td><td class="code"><pre><span class="o">{</span><span
class="s2">"category"</span> : <span class="s2">"class"</span>, <span
class="s2">"confidence"</span> : 1.0<span class="o">}</span>
</pre></td></tr></tbody></table> </div> <p>which is what you should see upon
inputting the latter command for querying.</p><h3
id='gathering-more-data-and-retraining-your-model'
class='header-anchors'>Gathering More Data and Retraining Your Model</h3><p>The
importing data section that is included in this tutorial uses a sample data set
for illustration purposes, and uses the PredictionIO Python SDK to import the
data. However, there are a variety of ways that you can <a
href="//predictionio.apache.org/datacollection/eventapi/">import</a> your
collected data (via REST or other SDKs).</p><p>As you continue to collect your
data, it is quite easy to retrain your model once you actually import your data
into the Event Server. You simply repeat the steps listed in the Quick Start
guide. We re-list them here again:</p><p><strong>1.</strong> Build your
engine.</p><div class="highlight shell"><table style="border-spacing:
0"><tbody><tr><td class="gutter gl" style="text-align: right"><pre
class="lineno">1</pre></td><td class="code"><pre><span class="gp">$ </span>pio
build
-</pre></td></tr></tbody></table> </div> <p><strong>2.a.</strong> Evaluate your
training model and tune parameters.</p><div class="highlight shell"><table
style="border-spacing: 0"><tbody><tr><td class="gutter gl" style="text-align:
right"><pre class="lineno">1</pre></td><td class="code"><pre><span class="gp">$
</span>pio <span class="nb">eval
</span>org.template.textclassification.AccuracyEvaluation
org.template.textclassification.EngineParamsList
+</pre></td></tr></tbody></table> </div> <p><strong>2.a.</strong> Evaluate your
training model and tune parameters.</p><div class="highlight shell"><table
style="border-spacing: 0"><tbody><tr><td class="gutter gl" style="text-align:
right"><pre class="lineno">1</pre></td><td class="code"><pre><span class="gp">$
</span>pio <span class="nb">eval
</span>org.example.textclassification.AccuracyEvaluation
org.example.textclassification.EngineParamsList
</pre></td></tr></tbody></table> </div> <p><strong>2.b.</strong> Train your
model and deploy.</p><div class="highlight shell"><table style="border-spacing:
0"><tbody><tr><td class="gutter gl" style="text-align: right"><pre
class="lineno">1
2</pre></td><td class="code"><pre><span class="gp">$ </span>pio train
<span class="gp">$ </span>pio deploy
-</pre></td></tr></tbody></table> </div> </div></div></div></div><footer><div
class="container"><div class="seperator"></div><div class="row"><div
class="col-md-6 footer-link-column"><div
class="footer-link-column-row"><h4>Community</h4><ul><li><a
href="//predictionio.apache.org/install/"
target="blank">Download</a></li><li><a href="//predictionio.apache.org/"
target="blank">Docs</a></li><li><a
href="//github.com/apache/incubator-predictionio"
target="blank">GitHub</a></li><li><a
href="mailto:[email protected]" target="blank">Subscribe
to User Mailing List</a></li><li><a
href="//stackoverflow.com/questions/tagged/predictionio"
target="blank">Stackoverflow</a></li></ul></div></div><div class="col-md-6
footer-link-column"><div
class="footer-link-column-row"><h4>Contribute</h4><ul><li><a
href="//predictionio.apache.org/community/contribute-code/"
target="blank">Contribute</a></li><li><a
href="//github.com/apache/incubator-predictionio" target="blank">Source Code</a>
</li><li><a href="//issues.apache.org/jira/browse/PIO" target="blank">Bug
Tracker</a></li><li><a href="mailto:[email protected]"
target="blank">Subscribe to Development Mailing
List</a></li></ul></div></div></div><div class="row"><div class="col-md-12
footer-link-column"><p>Apache PredictionIO, PredictionIO, Apache, the Apache
feather logo, and the Apache PredictionIO project logo are either registered
trademarks or trademarks of The Apache Software Foundation in the United States
and other countries.</p><p>All other marks mentioned may be trademarks or
registered trademarks of their respective owners.</p></div></div></div><div
id="footer-bottom"><div class="container"><div class="row"><div
class="col-md-12"><div id="footer-logo-wrapper"><img alt="PredictionIO"
src="/images/logos/logo-white-d1e9c6e6.png"/><span>®</span></div><div
id="social-icons-wrapper"><a class="github-button"
href="https://github.com/apache/incubator-predictionio" data-style="mega"
data-coun
t-href="/apache/incubator-predictionio/stargazers"
data-count-api="/repos/apache/incubator-predictionio#stargazers_count"
data-count-aria-label="# stargazers on GitHub" aria-label="Star
apache/incubator-predictionio on GitHub">Star</a> <a class="github-button"
href="https://github.com/apache/incubator-predictionio/fork"
data-icon="octicon-git-branch" data-style="mega"
data-count-href="/apache/incubator-predictionio/network"
data-count-api="/repos/apache/incubator-predictionio#forks_count"
data-count-aria-label="# forks on GitHub" aria-label="Fork
apache/incubator-predictionio on GitHub">Fork</a> <script id="github-bjs"
async="" defer="" src="https://buttons.github.io/buttons.js"></script><a
href="https://twitter.com/predictionio" target="blank"><img alt="PredictionIO
on Twitter" src="/images/icons/twitter-ea9dc152.png"/></a> <a
href="https://www.facebook.com/predictionio" target="blank"><img
alt="PredictionIO on Facebook" src="/images/icons/facebook-5c57939c.png"/></a>
</div></div><
/div></div></div></footer></div><script>(function(w,d,t,u,n,s,e){w['SwiftypeObject']=n;w[n]=w[n]||function(){
+</pre></td></tr></tbody></table> </div> </div></div></div></div><footer><div
class="container"><div class="seperator"></div><div class="row"><div
class="col-md-6 footer-link-column"><div
class="footer-link-column-row"><h4>Community</h4><ul><li><a
href="//predictionio.apache.org/install/"
target="blank">Download</a></li><li><a href="//predictionio.apache.org/"
target="blank">Docs</a></li><li><a href="//github.com/apache/predictionio"
target="blank">GitHub</a></li><li><a
href="mailto:[email protected]" target="blank">Subscribe
to User Mailing List</a></li><li><a
href="//stackoverflow.com/questions/tagged/predictionio"
target="blank">Stackoverflow</a></li></ul></div></div><div class="col-md-6
footer-link-column"><div
class="footer-link-column-row"><h4>Contribute</h4><ul><li><a
href="//predictionio.apache.org/community/contribute-code/"
target="blank">Contribute</a></li><li><a
href="//github.com/apache/predictionio" target="blank">Source
Code</a></li><li><a href="//
issues.apache.org/jira/browse/PIO" target="blank">Bug Tracker</a></li><li><a
href="mailto:[email protected]" target="blank">Subscribe to
Development Mailing List</a></li></ul></div></div></div><div class="row"><div
class="col-md-12 footer-link-column"><p>Apache PredictionIO, PredictionIO,
Apache, the Apache feather logo, and the Apache PredictionIO project logo are
either registered trademarks or trademarks of The Apache Software Foundation in
the United States and other countries.</p><p>All other marks mentioned may be
trademarks or registered trademarks of their respective
owners.</p></div></div></div><div id="footer-bottom"><div
class="container"><div class="row"><div class="col-md-12"><div
id="footer-logo-wrapper"><img alt="PredictionIO"
src="/images/logos/logo-white-d1e9c6e6.png"/><span>®</span></div><div
id="social-icons-wrapper"><a class="github-button"
href="https://github.com/apache/predictionio" data-style="mega"
data-count-href="/apache/predictionio/s
targazers" data-count-api="/repos/apache/predictionio#stargazers_count"
data-count-aria-label="# stargazers on GitHub" aria-label="Star
apache/predictionio on GitHub">Star</a> <a class="github-button"
href="https://github.com/apache/predictionio/fork"
data-icon="octicon-git-branch" data-style="mega"
data-count-href="/apache/predictionio/network"
data-count-api="/repos/apache/predictionio#forks_count"
data-count-aria-label="# forks on GitHub" aria-label="Fork apache/predictionio
on GitHub">Fork</a> <script id="github-bjs" async="" defer=""
src="https://buttons.github.io/buttons.js"></script><a
href="https://twitter.com/predictionio" target="blank"><img alt="PredictionIO
on Twitter" src="/images/icons/twitter-ea9dc152.png"/></a> <a
href="https://www.facebook.com/predictionio" target="blank"><img
alt="PredictionIO on Facebook" src="/images/icons/facebook-5c57939c.png"/></a>
</div></div></div></div></div></footer></div><script>(function(w,d,t,u,n,s,e){w['SwiftypeObject']=n;w[n]=w[n]||fu
nction(){
(w[n].q=w[n].q||[]).push(arguments);};s=d.createElement(t);
e=d.getElementsByTagName(t)[0];s.async=1;s.src=u;e.parentNode.insertBefore(s,e);
})(window,document,'script','//s.swiftypecdn.com/install/v1/st.js','_st');