Added: 
uima/sandbox/trunk/TextMarker/uima-docbook-textmarker/src/docbook/tools.textmarker.language.xml
URL: 
http://svn.apache.org/viewvc/uima/sandbox/trunk/TextMarker/uima-docbook-textmarker/src/docbook/tools.textmarker.language.xml?rev=1363750&view=auto
==============================================================================
--- 
uima/sandbox/trunk/TextMarker/uima-docbook-textmarker/src/docbook/tools.textmarker.language.xml
 (added)
+++ 
uima/sandbox/trunk/TextMarker/uima-docbook-textmarker/src/docbook/tools.textmarker.language.xml
 Fri Jul 20 12:27:14 2012
@@ -0,0 +1,626 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE chapter PUBLIC "-//OASIS//DTD DocBook XML V4.4//EN"
+"http://www.oasis-open.org/docbook/xml/4.4/docbookx.dtd"[
+<!ENTITY imgroot "images/tools/tools.textmarker/" >
+<!ENTITY % uimaents SYSTEM "../../target/docbook-shared/entities.ent" >  
+%uimaents;
+]>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more 
contributor 
+       license agreements. See the NOTICE file distributed with this work for 
additional 
+       information regarding copyright ownership. The ASF licenses this file 
to 
+       you under the Apache License, Version 2.0 (the "License"); you may not 
use 
+       this file except in compliance with the License. You may obtain a copy 
of 
+       the License at http://www.apache.org/licenses/LICENSE-2.0 Unless 
required 
+       by applicable law or agreed to in writing, software distributed under 
the 
+       License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 
CONDITIONS 
+       OF ANY KIND, either express or implied. See the License for the 
specific 
+       language governing permissions and limitations under the License. -->
+
+<chapter id="ugr.tools.tm.language.language">
+       <title>TextMarker Language</title>
+       <para>
+
+       </para>
+
+       <section id="ugr.tools.tm.language.seeding">
+               <title>Basic Annotations and tokens</title>
+               <para>
+                       The TextMarker system uses a JFlex lexer to initially 
create a
+                       seed of
+                       basic, token annotations.
+               </para>
+       </section>
+       <section id="ugr.tools.tm.language.syntax">
+               <title>Syntax</title>
+               <para>
+                       Structure
+                       <programlisting><![CDATA[
+            script                 -> packageDeclaration globalStatements 
statements
+            packageDeclaration     -> "PACKAGE" DottedIdentifier ";"
+            globalStatements       -> globalStatement*   
+            globalStatement        -> ("TYPESYSTEM" | "SCRIPT" | "ENGINE") 
DottedIdentifier ";"
+            statements             -> statement*
+            statement              -> typeDeclaration | resourceDeclaration | 
variableDeclaration 
+                                      | blockDeclaration | simpleStatement
+            ]]></programlisting>
+
+                       Declarations
+                       <programlisting><![CDATA[
+                               typeDeclaration -> "DECLARE" (AnnotationType)? 
Identifier ("," Identifier )*
+                               | "DECLARE" AnnotationType Identifier ( "(" 
featureDeclaration ")" )?
+                               featureDeclaration -> ( (AnnotationType | 
"STRING" | "INT" |
+                               "DOUBLE" | "BOOLEAN") Identifier)+
+                               resourceDeclaration -> ("WORDLIST" Identifier = 
listExpression | "WORDTABLE" Identifier
+                               = tableExpression) ";"
+                               variableDeclaration -> ("TYPE" | "STRING" | 
"INT" | "DOUBLE" | "BOOLEAN") Identifier
+                               ";"
+                               ]]>
+                       </programlisting>
+                       More information about Declarations.
+
+                       Statements
+                       <programlisting><![CDATA[
+            blockDeclaration       -> "BLOCK" "(" Identifier ")" 
ruleElementWithType "{" statements "}"
+            simpleStatement        -> ruleElements ";"
+            ruleElements           -> ( ruleElementWithLiteral  | 
ruleElementWithType )+
+            ruleElementWithLiteral -> simpleStringExpression quantifierPart? 
conditionActionPart?
+            ruleElementWithType    -> typeExpression quantifierPart? 
conditionActionPart?
+            quantifierPart         -> "*" | "*?" | "+" | "+?" | "?" | "??" 
+                                      | "[" numberExpression "," 
numberExpression "]"
+                                      | "[" numberExpression "," 
numberExpression "]?"
+                                      
+            conditionActionPart    -> "{" (condition ( "," condition )*)? ( 
"->" (action( "," action)*))? "}"        
+            condition              -> ConditionName ("(" argument ("," 
argument)* ")")?
+            action                 -> ActionName ("(" argument ("," argument)* 
")")?
+            ]]></programlisting>
+                       More information about Quantifiers,
+                       Conditions, Actions and Blocks.
+                       The ruleElementWithType of a BLOCK declaration must 
have opening
+                       and
+                       closing curly brackets (e.g., BLOCK(name) Document{} 
{...})
+
+                       Expressions
+                       <programlisting><![CDATA[
+            argument                   -> typeExpression | numberExpression | 
stringExpression | booleanExpression
+            typeExpression             -> AnnotationType | TypeVariable
+            numberExpression           -> additiveExpression
+            additiveExpression         -> multiplicativeExpression
+            multiplicativeExpression   -> simpleNumberExpression ( ( "*" | "/" 
| "%" ) simpleNumberExpression )*
+                                          | ( "EXP" | "LOGN" | "SIN" | "COS" | 
"TAN" ) numberExpressionInPar
+            numberExpressionInPar      -> "(" additiveExpression ")"
+            simpleNumberExpression     -> "-"? ( DecimalLiteral | 
FloatingPointLiteral | NumberVariable)
+                                          | numberExpressionInPar      
+            stringExpression           -> simpleStringExpression ( "+" 
simpleSEOrNE )*                   
+            simpleStringExpression     -> StringLiteral | StringVariable
+            simpleSEOrNE               -> simpleStringExpression | 
numberExpressionInPar
+            booleanExpression          -> booleanNumberExpression | 
BooleanVariable | BooleanLiteral
+            booleanNumberExpression    -> "(" numberExpression ( "<" | "<=" | 
">" | ">=" | "==" | "!=" ) numberExpression ")"
+            listExpression             -> Identifier | ResourceLiteral
+            tableExpression            -> Identifier | ResourceLiteral
+            ]]></programlisting>
+                       More information about Expressions. A ResourceLiteral
+                       is something
+                       like 'folder/file.txt' (yes, with single quotes).
+               </para>
+       </section>
+       <section id="ugr.tools.tm.language.inference">
+               <title>Inference</title>
+               <para>
+                       The inference relies on a complete, disjunctive 
partition of the
+                       document. A basic (minimal) annotation for each element 
of the
+                       partition is assigned to a type of a hierarchy. These 
basic
+                       annotations are enriched for performance reasons with 
information
+                       about annotations that start at the same offset or 
overlap with the
+                       basic annotation. Normally, a scanner creates a basic 
annotation for
+                       each token, punctuation or whitespace, but can also be 
replaced with
+                       a different annotation seeding strategy. Unlike other 
rule-based
+                       information extraction languages, the rules are executed 
in an
+                       imperative way. Experience has shown that the 
dependencies between
+                       rules, e.g., the same annotation types in the action 
and in the
+                       condition of a different rule, often form tree-like and 
not
+                       graph-like structures. Therefore, the sequencing and 
imperative
+                       processing did not cause disadvantages, but instead 
obvious
+                       advantages, e.g., the improved understandability of 
large rule sets.
+                       The following algorithm summarizes the rule inference:
+                       <programlisting><![CDATA[
+collect all basic annotations that fulfill the first matching condition
+  for all collected basic annotations do
+    for all rule elements of current rule do
+    if quantifier wants to match then
+      match the conditions of the rule element on the current basic annotation
+      determine the next basic annotation after the current match
+      if quantifier wants to continue then
+        if there is a next basic annotation then
+          continue with the current rule element and the next basic annotation
+        else if rule element did not match then
+          reset the next basic annotation to the current one
+      set the current basic annotation to the next one
+      if some rule elements did not match then
+        stop and continue with the next collected basic annotation
+      else if there is no current basic annotation and the quantifier wants to 
continue then
+        set the current basic annotation to the previous one
+  if all rule elements matched then
+    execute the actions of all rule elements
+]]></programlisting>
+                       The rule elements can of course match on all kinds of 
annotations.
+                       Therefore the determination of the next basic 
annotation returns the
+                       first basic annotation after the last basic annotation 
of the
+                       complete, matched annotation.
+
+               </para>
+       </section>
+       <section id="ugr.tools.tm.language.declarations">
+               <title>Declarations</title>
+               <para>
+
+                       There are three different kinds of declarations in the 
TextMarker
+                       system:
+                       Declarations of types with optional feature definitions 
of
+                       that type,
+                       declarations of variables and declarations for importing
+                       external
+                       resources, scripts or UIMA components.
+               </para>
+               <section id="ugr.tools.tm.language.declarations.type">
+                       <title>Type</title>
+                       <para>
+                               Type declarations define new kinds of 
annotation types and
+                               optionally their features.
+
+                               Examples:
+                               <programlisting><![CDATA[
+            DECLARE SimpleType1, SimpleType2; // <- two new types with the 
parent type "Annotation"
+            DECLARE ParentType NewType (SomeType feature1, INT feature2); // 
<- defines a new type "NewType" 
+                // with parent type "ParentType" and two features
+            ]]></programlisting>
+
+                               If the parent type is not defined in the same 
namespace, then the
+                               complete namespace has to be used, e.g., DECLARE
+                               my.other.package.Parent NewType;
+                       </para>
+               </section>
+               <section id="ugr.tools.tm.language.declarations.variable">
+                       <title>Variable</title>
+                       <para>
+                               Variable declarations define new variables. 
There are five kinds of
+                               variables:
+                               * Type variable: A variable that represents an 
annotation
+                               type.
+                               * Integer variable: A variable that represents 
an integer.
+                               *
+                               Double variable: A variable that represents a 
floating-point
+                               number.
+                               * String variable: A variable that represents a 
string.
+                               *
+                               Boolean
+                               variable: A variable that represents a boolean.
+
+                               Examples:
+                               <programlisting><![CDATA[
+                TYPE newTypeVariable;
+                INT newIntegerVariable;
+                DOUBLE newDoubleVariable;
+                STRING newStringVariable;
+                BOOLEAN newBooleanVariable;
+                ]]></programlisting>
+
+                       </para>
+               </section>
+               <section id="ugr.tools.tm.language.declarations.ressource">
+                       <title>Resources</title>
+                       <para>
+
+                               There are two kinds of resource declarations 
that make external
+                               resources available in the TextMarker system:
+                               * List: A list
+                               represents a normal text file with an entry per 
line
+                               or a compiled
+                               tree of a word list.
+                               * Table: A table represents a comma-separated
+                               file.
+
+                               Examples:
+                               <programlisting><![CDATA[
+                WORDLIST Name = 'someWordList.txt';
+                WORDTABLE Name = 'someTable.csv';
+                ]]></programlisting>
+
+                       </para>
+               </section>
+               <section id="ugr.tools.tm.language.declarations.scripts">
+                       <title>Scripts</title>
+                       <para>
+
+                               Additional scripts can be imported and reused 
with the CALL action.
+                               The types of the imported rules are then also 
available, so that it
+                               is not necessary to import the Type System of 
the additional rule
+                               script.
+
+                               Examples:
+                               <programlisting><![CDATA[
+                SCRIPT my.package.AnotherScript; // <- "AnotherScript.tm" in 
the "my.package" package
+                Document{->CALL(AnotherScript)}; // <- rule executes 
"AnotherScript.tm"
+                ]]></programlisting>
+
+                       </para>
+               </section>
+               <section id="ugr.tools.tm.language.declarations.components">
+                       <title>Components</title>
+                       <para>
+
+                               There are two kinds of UIMA components that can 
be imported in a
+                               TextMarker script:
+                               * Type System: includes the types defined in an
+                               external type system.
+                               * Analysis Engine: makes an external analysis
+                               engine available. The
+                               type system needed for the analysis engine has
+                               to be imported
+                               separately. Please mind the filtering setting 
when
+                               calling an
+                               external analysis engine.
+
+                               Examples:
+                               <programlisting><![CDATA[
+                ENGINE my.package.ExternalEngine; // <- "ExternalEngine.xml" 
in the 
+                    // "my.package" package (in the descriptor folder)
+                TYPESYSTEM my.package.ExternalTypeSystem; // <- 
"ExternalTypeSystem.xml" 
+                    // in the "my.package" package (in the descriptor folder)
+                Document{->RETAINTYPE(SPACE,BREAK),CALL(ExternalEngine)}; 
+                    // calls ExternalEngine, but retains white spaces
+                ]]></programlisting>
+
+                       </para>
+               </section>
+       </section>
+       <section id="ugr.tools.tm.language.quantifier">
+               <title>Quantifiers</title>
+               <para>
+               </para>
+               <section id="ugr.tools.tm.language.quantifier.sg">
+                       <title>* Star Greedy</title>
+                       <para>
+                               The Star Greedy quantifier matches on any 
amount of annotations and
+                               evaluates always true. Please mind, that a rule 
element with a Star
+                               Greedy quantifier needs to match on different 
annotations than the
+                               next rule element.
+
+                               Examples:
+                               <programlisting><![CDATA[
+                Input:    small Big Big Big small
+                Rule:     CW*
+                Matched:  Big Big Big  
+                Matched:  Big Big 
+                Matched:  Big
+                ]]></programlisting>
+
+                       </para>
+               </section>
+               <section id="ugr.tools.tm.language.quantifier.sr">
+                       <title>*? Star Reluctant</title>
+                       <para>
+                               The Star Reluctant quantifier matches on any 
amount of annotations
+                               and evaluates always true, but stops to match 
on new annotations,
+                               when the next rule element matches and 
evaluates true on this
+                               annotation.
+
+                               Examples:
+                               <programlisting><![CDATA[
+                Input:    123 456 small small Big 
+                Rule:     W*? CW
+                Matched:  small small Big
+                Matched:  small Big
+                Matched:  Big
+                ]]></programlisting>
+
+                       </para>
+               </section>
+               <section id="ugr.tools.tm.language.quantifier.pg">
+                       <title>+ Plus Greedy</title>
+                       <para>
+                               The Plus Greedy quantifier needs to match on at 
least one
+                               annotation. Please mind, that a rule element 
after a rule element
+                               with a Plus Greedy quantifier matches and 
evaluates on different
+                               conditions.
+
+                               Examples:
+
+                               <programlisting><![CDATA[
+                Input:    123 456 small small Big 
+                Rule:     SW+ 
+                Matched:  small small
+                Matched:  small 
+                ]]></programlisting>
+
+                       </para>
+               </section>
+               <section id="ugr.tools.tm.language.quantifier.pr">
+                       <title>+? Plus Reluctant</title>
+                       <para>
+                               The Plus Reluctant quantifier has to match on 
at least one
+                               annotation in order to evaluate true, but stops 
when the next rule
+                               element is able to match on this annotation.
+
+                               Examples:
+                               <programlisting><![CDATA[
+                Input:    123 456 small small Big 
+                Rule:     W+? CW
+                Matched:  small small Big
+                ]]></programlisting>
+
+                       </para>
+               </section>
+               <section id="ugr.tools.tm.language.quantifier.qg">
+                       <title>? Question Greedy</title>
+                       <para>
+                               The Question Greedy quantifier matches 
optionally on an annotation
+                               and therefore always evaluates true.
+
+                               Examples:
+                               <programlisting><![CDATA[
+                Input:    123 456 small Big small Big 
+                Rule:     SW CW? SW
+                Matched:  small Big small
+                ]]></programlisting>
+
+                       </para>
+               </section>
+               <section id="ugr.tools.tm.language.quantifier.qr">
+                       <title>?? Question Reluctant</title>
+                       <para>
+                               The Question Reluctant quantifier matches 
optionally on an
+                               annotation if the next rule element can not 
match on the same
+                               annotation and therefore always evaluates true.
+
+                               Examples:
+                               <programlisting><![CDATA[
+                Input:    123 456 small Big small Big 
+                Rule:     SW CW?? SW
+                Matched:  small Big small
+                ]]></programlisting>
+
+                       </para>
+               </section>
+               <section id="ugr.tools.tm.language.quantifier.mmg">
+                       <title>[x,y] Min Max Greedy</title>
+                       <para>
+                               The Min Max Greedy quantifier has to match at 
least x and at most y
+                               annotations of its rule element to evaluate 
true.
+
+                               Examples:
+                               <programlisting><![CDATA[
+                Input:    123 456 small Big small Big 
+                Rule:     SW CW[1,2] SW
+                Matched:  small Big small
+                ]]></programlisting>
+
+                       </para>
+               </section>
+               <section id="ugr.tools.tm.language.quantifier.mmr">
+                       <title>[x,y]? Min Max Reluctant</title>
+                       <para>
+                               The Min Max Reluctant quantifier has to match at 
least x and at most y
+                               annotations of its rule element to evaluate 
true, but stops to
+                               match
+                               on additional annotations if the next rule 
element is able to
+                               match
+                               on this annotation.
+
+                               Examples:
+                               <programlisting><![CDATA[
+                Input:    123 456 small Big Big Big small Big 
+                Rule:     SW CW[2,100]? SW
+                Matched:  small Big Big Big small
+                ]]></programlisting>
+                       </para>
+               </section>
+       </section>
+
+       
+       <xi:include xmlns:xi="http://www.w3.org/2001/XInclude" 
href="tools.textmarker.conditions.xml"/>
+       <xi:include xmlns:xi="http://www.w3.org/2001/XInclude" 
href="tools.textmarker.actions.xml"/>
+       
+       <section id="ugr.tools.tm.language.expressions">
+               <title>Expressions</title>
+               <para>
+               </para>
+               <section id="ugr.tools.tm.language.expressions.type">
+                       <title>Type Expressions</title>
+                       <para>
+                       </para>
+               </section>
+               <section id="ugr.tools.tm.language.expressions.numer">
+                       <title>Number Expressions</title>
+                       <para>
+                       </para>
+               </section>
+               <section id="ugr.tools.tm.language.expressions.string">
+                       <title>String Expressions</title>
+                       <para>
+                       </para>
+               </section>
+               <section id="ugr.tools.tm.language.expressions.boolean">
+                       <title>Boolean Expressions</title>
+                       <para>
+                       </para>
+               </section>
+       </section>
+       <section id="ugr.tools.tm.language.filtering">
+               <title>Robust extraction using filtering</title>
+               <para>
+                       Rule based or pattern based information extraction 
systems often
+                       suffer from unimportant fill words, additional 
whitespace and
+                       unexpected markup. The TextMarker System enables the 
knowledge
+                       engineer to filter and to hide all possible 
combinations of
+                       predefined and new types of annotations. Additionally, 
it can
+                       differentiate between every kind of HTML markup and XML 
tags. The
+                       visibility of tokens and annotations is modified by the 
actions of
+                       rule elements and can be conditioned using the complete
+                       expressiveness of the language. Therefore the 
TextMarker system
+                       supports a robust approach to information extraction 
and simplifies
+                       the creation of new rules since the knowledge engineer 
can focus on
+                       important textual features. If no rule action changed 
the
+                       configuration of the filtering settings, then the 
default filtering
+                       configuration ignores whitespaces and markup. Using the 
default
+                       setting, the following rule matches all four types of 
input in this
+                       example:
+                       <programlisting><![CDATA[
+"Dr" PERIOD CW CW
+]]></programlisting>
+                       <programlisting><![CDATA[
+Dr. Peter Steinmetz
+Dr . Peter      Steinmetz
+Dr. <b><i>Peter</i> Steinmetz</b>
+Dr.PeterSteinmetz
+]]></programlisting>
+               </para>
+       </section>
+       <section id="ugr.tools.tm.language.blocks">
+               <title>Blocks</title>
+               <para>
+                       Blocks combine some more complex control structures in 
the
+                       TextMarker
+                       language: conditioned statements, loops and procedures.
+
+
+                       The
+                       rule
+                       element
+                       in the definition of a block has to define a
+                       condition/action
+                       part,
+                       even if that part is empty (LCURLY and
+                       RCURLY).
+
+
+                       A block can use
+                       normal
+                       conditions to condition the execution
+                       of its
+                       containing rules.
+
+                       Examples:
+
+                       <programlisting><![CDATA[
+DECLARE Month;
+
+BLOCK(EnglishDates) Document{FEATURE("language", "en")} {
+    Document{->MARKFAST(Month,'englishMonthNames.txt')};
+    //...
+}
+
+BLOCK(GermanDates) Document{FEATURE("language", "de")} {
+    Document{->MARKFAST(Month,'germanMonthNames.txt')};
+    //...
+}
+]]></programlisting>
+
+
+                       A block can be used to execute the containing rule on a 
sequence of
+                       similar text passages.
+
+                       Examples:
+                       <programlisting><![CDATA[
+BLOCK(Paragraphs) Paragraphs{} { // <- limit the local view on the document: 
defines a local document
+    // This rule will be executed for each Paragraph that can be found in the 
current document.
+    Document{CONTAINS(Keyword)->MARK(SpecialParagraph)}; 
+    // Here, Document represents not the complete input document, but each 
Paragraph defined by the block statement.
+}
+]]></programlisting>
+               </para>
+       </section>
+       <section id="ugr.tools.tm.language.score">
+               <title>Heuristic extraction using scoring rules</title>
+               <para>
+                       Diagnostic scores are a well known and successfully 
applied
+                       knowledge
+                       formalization pattern for diagnostic problems. Single 
known
+                       findings
+                       valuate a possible solution by adding or subtracting 
points
+                       on an
+                       account of that solution. If the sum exceeds a given 
threshold,
+                       then
+                       the solution is derived. One of the advantages of this 
pattern
+                       is the
+                       robustness against missing or false findings, since a 
high
+                       number of
+                       findings is used to derive a solution.
+
+                       The TextMarker system
+                       tries to
+                       transfer this diagnostic problem
+                       solution
+                       strategy to the
+                       information
+                       extraction problem. In addition to a
+                       normal creation of a
+                       new
+                       annotation, a MARKSCORE action can add positive
+                       or negative scoring
+                       points
+                       to the text fragments matched by the rule
+                       elements. If the
+                       amount of
+                       points exceeds the defined threshold for
+                       the respective
+                       type, then a
+                       new annotation will be created. Further,
+                       the current
+                       value of heuristic
+                       points of a possible annotation can
+                       be
+                       evaluated by
+                       the SCORE condition.
+                       In the following, the heuristic
+                       extraction using
+                       scoring rules is
+                       demonstrated by a short example:
+
+                       <programlisting><![CDATA[
+            Paragraph{CONTAINS(W,1,5)->MARKSCORE(5,Headline)};
+            Paragraph{CONTAINS(W,6,10)->MARKSCORE(2,Headline)};
+            Paragraph{CONTAINS(Emph,80,100,true)->MARKSCORE(7,Headline)};
+            Paragraph{CONTAINS(Emph,30,80,true)->MARKSCORE(3,Headline)};
+            Paragraph{CONTAINS(CW,50,100,true)->MARKSCORE(7,Headline)};
+            Paragraph{CONTAINS(W,0,0)->MARKSCORE(-50,Headline)};
+            Headline{SCORE(10)->MARK(Realhl)};
+            Headline{SCORE(5,10)->LOG("Maybe a headline")};
+                ]]></programlisting>
+
+
+                       In the first part of this rule set, annotations of the 
type
+                       paragraph
+                       receive scoring points for a headline annotation, if 
they
+                       fulfill
+                       certain CONTAINS conditions. The first condition, for
+                       example,
+                       evaluates to true, if the paragraph contains one word 
up to
+                       five
+                       words, whereas the fourth condition is fulfilled, if 
the
+                       paragraph
+                       contains thirty up to eighty percent of emph 
annotations.
+                       The last
+                       two
+                       rules finally execute their actions, if the score of a
+                       headline
+                       annotation exceeds ten points, or lies in the interval 
of
+                       five and
+                       ten
+                       points, respectively.
+               </para>
+       </section>
+       <section id="ugr.tools.tm.language.modification">
+               <title>Modification</title>
+               <para>
+                       There are different actions that can modify the input 
document,
+                       like DEL,
+                       COLOR and REPLACE. But the input document itself can 
not be
+                       modified
+                       directly. A separate engine, the Modifier.xml, has to be
+                       called in
+                       order to create another cas view with the name 
"modified".
+                       In that
+                       document all modifications are executed.
+               </para>
+       </section>
+</chapter>
\ No newline at end of file

Added: 
uima/sandbox/trunk/TextMarker/uima-docbook-textmarker/src/docbook/tools.textmarker.overview.xml
URL: 
http://svn.apache.org/viewvc/uima/sandbox/trunk/TextMarker/uima-docbook-textmarker/src/docbook/tools.textmarker.overview.xml?rev=1363750&view=auto
==============================================================================
--- 
uima/sandbox/trunk/TextMarker/uima-docbook-textmarker/src/docbook/tools.textmarker.overview.xml
 (added)
+++ 
uima/sandbox/trunk/TextMarker/uima-docbook-textmarker/src/docbook/tools.textmarker.overview.xml
 Fri Jul 20 12:27:14 2012
@@ -0,0 +1,402 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE chapter PUBLIC "-//OASIS//DTD DocBook XML V4.4//EN"
+"http://www.oasis-open.org/docbook/xml/4.4/docbookx.dtd"[
+<!ENTITY imgroot "images/tools/tools.textmarker/" >
+<!ENTITY % uimaents SYSTEM "../../target/docbook-shared/entities.ent" >  
+%uimaents;
+]>
+<!--
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied.  See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+
+<chapter id="ugr.tools.tm.introduction">
+    <title>TextMarker</title>
+    <para>The TextMarker system is an open source tool
+        for the development
+        of rule-based information extraction applications.
+        The development
+        environment is based on the DLTK framework. It
+        supports the knowledge
+        engineer with a full-featured rule editor,
+        components for the
+        explanation of the rule inference and a build
+        process for generic UIMA
+        Analysis Engines and Type Systems.
+        Therefore TextMarker components can
+        be easily created and combined
+        with other UIMA components in different
+        information extraction
+        pipelines rather flexibly.
+
+        TextMarker applies a
+        specialized rule representation language for the effective
+        knowledge
+        formalization:
+        The rules of the TextMarker language are composed of a
+        list of rule
+        elements that themselves consist of four parts: The
+        mandatory
+        matching condition establishes a connection to the input
+        document by
+        referring to an already existing concept, respectively
+        annotation.
+        The
+        optional quantifier defines the usage of the matching
+        condition
+        similar to regular expressions. Then, additional conditions
+        add
+        constraints to the matched text fragment and additional actions
+        determine the consequences of the rule. Therefore, TextMarker rules
+        match on a pattern of given annotations and, if the additional
+        conditions evaluate true, then they execute their actions, e.g.
+        create
+        a new annotation. If no initial annotations exist, for example,
+        created by another component, a scanner is used to seed simple token
+        annotations contained in a taxonomy.
+
+        The TextMarker system provides
+        unique functionality that is usually not
+        found in similar systems. The
+        actions are able to modify the document
+        either by replacing or
+        deleting text fragments or by filtering the
+        view on the document. In
+        this case, the rules ignore some
+        annotations,
+        e.g. HTML markup, or are
+        executed only on the remaining text passages.
+        The knowledge engineer
+        is able to add heuristic knowledge by using
+        scoring rules.
+        Additionally, several language elements common to
+        scripting languages
+        like conditioned statements, loops, procedures,
+        recursion, variables
+        and expressions increase the expressiveness of
+        the language. Rules are
+        able to directly invoke external rule sets or
+        arbitrary UIMA Analysis
+        Engines and foreign libraries can be
+        integrated with the extension
+        mechanism for new language elements.
+
+    </para>
+    <section id="ugr.tools.tm.introduction.metaphor">
+        <title>Introduction</title>
+        <para>
+            In manual information extraction humans often apply a strategy
+            according to a highlighter metaphor: First relevant headlines are
+            considered and classified according to their content by coloring
+            them
+            with different highlighters. The paragraphs of the annotated
+            headlines
+            are then considered further. Relevant text fragments or
+            single words
+            in the context of that headline can then be colored. In
+            this way, a
+            top-down analysis and extraction strategy is implemented.
+            Necessary
+            additional information can then be added that either refers
+            to other
+            text segments or contains valuable domain specific
+            information.
+            Finally the colored text can be easily analyzed
+            concerning the
+            relevant information.
+
+            The TextMarker system (textmarker
+            is a common German word for a
+            highlighter) tries to imitate this
+            manual extraction method by
+            formalizing the appropriate actions using
+            matching rules: The rules
+            mark sequences of words, extract text
+            segments or modify the input
+            document depending on textual
+            features. The default input for the
+            TextMarker system is
+            semi-structured text, but it can also process
+            structured or free
+            text. Technically, HTML is often the input
+            format,
+            since most word
+            processing documents can be converted to HTML.
+            Additionally, the
+            TextMarker system offers the possibility to
+            create
+            a modified output
+            document.
+        </para>
+    </section>
+    <section id="ugr.tools.tm.introduction.concepts">
+        <title>Core Concepts</title>
+        <para>
+            As a first step in the extraction process the TextMarker system 
uses
+            a
+            tokenizer (scanner) to tokenize the input document and to create a
+            stream of basic symbols. The types and valid annotations of the
+            possible tokens are predefined by a taxonomy of annotation types.
+            Annotations simply refer to a section of the input document and
+            assign a type or concept to the respective text fragment. The 
figure
+            on the right shows an excerpt of a basic annotation taxonomy: CW
+            describes all tokens, for example, that contain a single word
+            starting with a capital letter, MARKUP corresponds to HTML or XML
+            tags, and PM refers to all kinds of punctuation marks. Take a look
+            at <xref linkend="ugr.tools.tm.language.seeding" /> for a complete list of
+            initial annotations.
+
+
+            <screenshot>
+                <mediaobject>
+                    <imageobject>
+                        <imagedata scale="80" format="PNG" 
fileref="&imgroot;symboltaxo.png" />
+                    </imageobject>
+                    <textobject>
+                        <phrase>Part of a taxonomy for basic annotation 
types.</phrase>
+                    </textobject>
+                </mediaobject>
+            </screenshot>
+
+            By using (and extending) the taxonomy, the knowledge engineer is
+            able
+            to choose the most adequate types and concepts when defining new
+            matching rules, i.e., TextMarker rules for matching a text fragment
+            given by a set of symbols to an annotation. If the capitalization 
of
+            a word, for example, is of no importance, then the annotation type 
W
+            that describes words of any kind can be used. The initial scanner
+            creates a set of basic annotations that may be used by the matching
+            rules of the TextMarker language. However, most information
+            extraction applications require domain specific concepts and
+            annotations. Therefore, the knowledge engineer is able to extend 
the
+            set of annotations, and to define new annotation types tuned to the
+            requirements of the given domain. These types can be flexibly
+            integrated in the taxonomy of annotation types.
+
+            One of the goals in
+            developing a new information extraction language
+            was
+            to maintain an
+            easily readable syntax while still providing a
+            scalable
+            expressiveness of the language. Basically, the TextMarker
+            language
+            contains expressions for the definition of new annotation
+            types and
+            for defining new matching rules. The rules are defined by a
+            list of
+            rule elements.
+            Each rule element contains at least a basic matching
+            condition referring
+            to text fragments or already specified
+            annotations. Additionally a
+            list of conditions and actions may be
+            specified for a rule element.
+            Whereas the conditions describe
+            necessary attributes of the matched
+            text fragment, the actions point
+            to operations and assignments on
+            the
+            current fragments. These actions
+            will then only be executed if all
+            basic conditions matched on a text
+            fragment or the annotation and the
+            related conditions are fulfilled.
+        </para>
+    </section>
+    <section id="ugr.tools.tm.introduction.examples">
+        <title>Examples</title>
+        <para>
+            The usage of the language and its readability can be demonstrated 
by
+            simple examples:
+
+            <programlisting><![CDATA[
+                CW{INLIST('animals.txt') -> MARK(Animal)};
+                Animal "and" Animal{-> MARK(Animalpair, 1, 2, 3)};
+    ]]></programlisting>
+
+            The first rule looks at all capitalized words that are listed in an
+            external document animals.txt and creates a new annotation of the
+            type
+            animal using the boundaries of the matched word. The second rule
+            searches for an annotation of the type animal followed by the
+            literal
+            "and" and a second animal annotation. Then it will create a new
+            annotation animalpair covering the text segment that matched the
+            three
+            rule elements (the digit parameters refer to the positions of
+            the matched
+            rule elements).
+
+            <programlisting><![CDATA[
+                Document{-> MARKFAST(Firstname, 'firstnames.txt')};
+                Firstname CW{-> MARK(Lastname)};
+                Paragraph{VOTE(Firstname, Lastname) -> LOG("Found more 
Firstnames than Lastnames")};
+    ]]></programlisting>
+
+            In this example, the first rule annotates all words that occur in
+            the
+            external document firstnames.txt with the type firstname. The
+            second
+            rule creates a lastname annotation for all capitalized words
+            that
+            follow a firstname annotation. The last rule finally processes
+            all
+            paragraph annotations. If the VOTE condition counts more
+            firstname
+            than lastname annotations, then the rule writes a log entry
+            with a
+            predefined message.
+
+
+            <programlisting><![CDATA[
+                ANY+{PARTOF(Paragraph), CONTAINS(Delete, 50, 100, true) -> 
MARK(Delete)};
+                Firstname{-> MARK(Delete,1 , 2)} Lastname;
+                Delete{-> DEL};
+            ]]></programlisting>
+
+            Here, the first rule looks for sequences of any kind of tokens
+            except
+            markup and creates one annotation of the type delete for each
+            sequence, if the tokens are part of a paragraph annotation and
+            contain together already more than 50% of delete annotations. The +
+            signs indicate this greedy processing. The second rule annotates
+            first
+            names followed by last names with the type delete and the third
+            rule
+            simply deletes all text segments that are associated with that
+            delete
+            annotation.
+
+        </para>
+    </section>
+    <section id="ugr.tools.tm.introduction.features">
+        <title>Special Features</title>
+        <para>
+            The TextMarker language features some special characteristics
+            that are
+            usually not found in other rule-based information extraction
+            systems
+            or even shift it towards scripting languages. The possibility
+            of
+            creating new annotation types and integrating them into the
+            taxonomy
+            facilitates an even more modular development of information
+            extraction systems.
+
+            Read more about robust extraction using
+            filtering, complex control
+            structures and heuristic extraction using
+            scoring rules.
+        </para>
+    </section>
+    <section id="ugr.tools.tm.introduction.getstarted">
+        <title>Get started</title>
+        <para>
+            This section gives you a short, technical introduction on
+            how to
+            get
+            started with the TextMarker system and mostly just links the
+            information
+            of the other wiki pages. Some knowledge about the usage
+            of Eclipse and
+            central concepts of UIMA is useful. TextMarker
+            consists of the
+            TextMarker rule language (and of course the rule
+            inference) and the
+            TextMarker workbench. Additionally, the CEV plugin
+            is used to edit
+            and
+            visualize annotated text. The TextRuler system
+            with implementations of
+            well known rule learning methods and
+            development extension with
+            support for test-driven development are
+            already integrated.
+        </para>
+        <section id="ugr.tools.tm.introduction.getstarted.running">
+            <title>Up and running</title>
+            <para>
+                First of all, install the Workbench and read the introduction
+                and its
+                examples. In order to verify if the Workbench is correctly
+                installed,
+                take a look at Help-About Eclipse-Installation Details
+                and compare
+                the installed plugins with the plugins you copied into
+                the plugins
+                folder of your Eclipse application. Normally most of the
+                plugins do
+                not cause any troubles, but the CEV does because of the
+                XPCom and
+                XULRunner dependencies. You should at least get the XPCom
+                plugin up
+                and running. However, you cannot use the additional HTML
+                functionality without the XULRunner plugin. If the plugins of 
the
+                installation guide do not work properly and a Google search 
for a
+                suitable plugin is not successful, then write a mail to the 
user
+                list and we will try to solve the problem. If all plugins are
+                correctly installed, then start the Eclipse application and 
switch
+                to
+                the TextMarker perspective (Window-Open Perspective-Other...)
+            </para>
+        </section>
+        <section id="ugr.tools.tm.introduction.getstarted.example">
+            <title>Learn by example</title>
+            <para>
+                Having a running Workbench, download the example project and
+                import/copy
+                this TextMarker project into your workspace. The project
+                contains
+                some simple rules for extracting the author, title and year
+                of
+                reference strings. Next, take a look at the project structure 
and
+                the
+                syntax and compare it with the example project and its 
contents.
+                Open
+                the Main.tm TextMarker script in the folder
+                script/de.uniwue.example
+                and press the Run button in the Eclipse
+                toolbar. The documents in
+                the
+                input folder will then be processed by
+                the Main.tm file and the
+                result of the information extraction task
+                is placed in the output
+                folder. As you can see, there are four
+                files: an xmiCAS for each
+                input file and an HTML file (the
+                modified/colored result). Open one of
+                the .xmi files with the CAS
+                Editor plugin (-popup menu-Open with) and
+                select some checkboxes in
+                the Annotation Browser view.
+            </para>
+        </section>
+        <section id="ugr.tools.tm.introduction.getstarted.doit">
+            <title>Do it yourself</title>
+            <para>
+                Try to write some rules yourself. Read the description of the
+                available
+                language constructs, e.g., conditions and actions and use
+                the
+                explanation component in order to take a closer look at the 
rule
+                inference. Then finally, read the rest of this document.
+            </para>
+        </section>
+    </section>
+</chapter>
\ No newline at end of file


Reply via email to