[MediaWiki-commits] [Gerrit] mediawiki...Wikispeech[master]: Map tokens from TTS responses to HTML

Sebastian Berlin (WMSE) (Code Review) Wed, 05 Oct 2016 01:42:17 -0700

Sebastian Berlin (WMSE) has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/314237


Change subject: Map tokens from TTS responses to HTML
......................................................................

Map tokens from TTS responses to HTML

Added mapping between the tokens received from the TTS server and the "words"
in the HTML. Tokens are stored in the utterance elements and are assigned a
position attribute, which is the index of the start of the corresponding
HTML substring. Removed HTML tags are stored in the tokens element.

Bug: T140105
Change-Id: Ie784328fa3d7bcf7941b6b89146687272fe3b0ca
---
M Hooks.php
A includes/CleanedTag.php
M includes/Cleaner.php
M includes/HtmlGenerator.php
M includes/Segmenter.php
M modules/ext.wikispeech.js
M tests/phpunit/CleanerTest.php
M tests/phpunit/HtmlGeneratorTest.php
M tests/phpunit/SegmenterTest.php
A tests/phpunit/Util.php
M tests/qunit/ext.wikispeech.test.js
11 files changed, 1,303 insertions(+), 271 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/Wikispeech 
refs/changes/37/314237/1

diff --git a/Hooks.php b/Hooks.php
index 33e6c1d..a7c3211 100644
--- a/Hooks.php
+++ b/Hooks.php
@@ -18,7 +18,7 @@
         *
         * @param array $testModules The array of registered test modules
         * @param ResourceLoader $resourceLoader The reference to the resource
-        *      loader
+        *  loader
         * @return true
         */
 
@@ -46,7 +46,7 @@
         * namespace.
         *
         * @param $parser Parser object. Can be used to manually parse a portion
-        *      of wiki text from the $text.
+        *  of wiki text from the $text.
         * @param $text Represents the text for page.
         */
 
@@ -59,14 +59,17 @@
                                'HTML from onParserAfterTidy(): ' . $text
                        );
                        $cleanedText = Cleaner::cleanHtml( $text );
-                       wfDebugLog( 'Wikispeech', 'Cleaned text: ' . 
$cleanedText );
+                       wfDebugLog(
+                               'Wikispeech',
+                               'Cleaned text: ' . var_export( $cleanedText, 
true )
+                       );
                        $utterances = Segmenter::segmentSentences( $cleanedText 
);
                        wfDebugLog(
                                'Wikispeech',
                                'Utterances: ' . var_export( $utterances, true )
                        );
                        $utterancesHtml =
-                               HtmlGenerator::generateUtterancesHtml( 
$utterances );
+                               HtmlGenerator::createUtterancesHtml( 
$utterances );
                        wfDebugLog(
                                'Wikispeech',
                                'Adding utterances HTML: ' . $utterancesHtml
@@ -101,7 +104,7 @@
         *
         * @param OutputPage $out The OutputPage object.
         * @param Skin $skin Skin object that will be used to generate the page,
-        *      added in 1.13.
+        *  added in 1.13.
         */
 
        public static function onBeforePageDisplay(
diff --git a/includes/CleanedTag.php b/includes/CleanedTag.php
new file mode 100644
index 0000000..a5f112c
--- /dev/null
+++ b/includes/CleanedTag.php
@@ -0,0 +1,76 @@
+<?php
+
+/**
+ * @file
+ * @ingroup Extensions
+ * @license GPL-2.0+
+ */
+
+abstract class CleanedTag {
+
+       /**
+        * @var string $tagString The string representation of the tag, as it is
+        * written in the HTML. This includes the tag name, any attributes, and
+        * the brackets.
+        */
+
+       public $tagString;
+
+       function __construct( $tagString ) {
+               $this->tagString = $tagString;
+       }
+
+       /**
+        * Get the length of the tag string.
+        *
+        * @since 0.0.1
+        * @return int The length of the tag string.
+        */
+
+       function getLength() {
+               return strlen( $this->tagString );
+       }
+}
+
+class CleanedStartTag extends CleanedTag {
+
+       /**
+        * @var int $contentLength The length of the element content, i.e. the
+        * string delimited by this start tag and the corresponding end tag.
+        */
+
+       public $contentLength;
+
+       /**
+        * @var bool $removed Whether the tag was completely removed.
+        */
+
+       public $removed;
+
+       function __construct( $tagString ) {
+               parent::__construct( $tagString );
+               $this->contentLength = 0;
+               $this->removed = false;
+       }
+
+       /**
+        * Get the length of the tag string.
+        *
+        * @since 0.0.1
+        * @return int The length of the tag string, including element content.
+        */
+
+       function getLength() {
+               $length = strlen( $this->tagString );
+               if ( $this->removed ) {
+                       $length += $this->contentLength;
+               }
+               return $length;
+       }
+}
+
+class CleanedEndTag extends CleanedTag {
+}
+
+class CleanedEmptyTag extends CleanedTag {
+}
diff --git a/includes/Cleaner.php b/includes/Cleaner.php
index 9dcc7e6..5fd5917 100644
--- a/includes/Cleaner.php
+++ b/includes/Cleaner.php
@@ -6,19 +6,49 @@
  * @license GPL-2.0+
  */
 
+require_once 'CleanedTag.php';
+
 class Cleaner {
 
        /**
-        * Clean HTML tags by removing some altogether and keeping content
-        * for some.
+        * Clean HTML tags from a string.
+        *
+        * Separates any HTML tags from the text.
         *
         * @since 0.0.1
         * @param string $markedUpText Input text that may contain HTML tags.
-        * @return string The text with HTML tags removed/replaced with
-        * contents.
+        * @return array An array of nodes where tags are stored as CleanedTags
+        *  and text nodes as strings.
         */
 
        public static function cleanHtml( $markedUpText ) {
+               $dom = self::createDomDocument( $markedUpText );
+               $tags = self::getTags( $markedUpText );
+               // Start adding the nodes that are children of the dummy 
element. To
+               // not add the actual dummy tags, index starts on -1.
+               $tagIndex = -1;
+               $cleanedContent = [];
+               self::addContent(
+                       $cleanedContent,
+                       $dom->documentElement->firstChild,
+                       $markedUpText,
+                       $tags,
+                       $tagIndex
+               );
+               return $cleanedContent;
+       }
+
+       /**
+        * Create a DOMDocument from an HTML string.
+        *
+        * A dummy element is added as top node.
+        *
+        * @since 0.0.1
+        * @param string $markedUpText The string to create the DOMDocument from.
+        * @return DOMDocument The created DOMDocument.
+        */
+
+       private static function createDomDocument( $markedUpText ) {
                $dom = new DOMDocument();
                // Add encoding information and wrap the input text in a dummy 
tag
                // to prevent p tags from being added for text nodes.
@@ -30,48 +60,249 @@
                        $wrappedText,
                        LIBXML_HTML_NODEFDTD | LIBXML_HTML_NOIMPLIED
                );
-               $cleanedText = self::getTextContent( $dom->documentElement );
-               return $cleanedText;
+               return $dom;
        }
 
        /**
-        * Recursively get the text from a node and its children.
+        * Extract a list of tags from a string.
+        *
+        * Tags are extracted as strings, in the order they appear. In cases
+        * where there are both start and end tag, these are stored together in 
an
+        * array.
         *
         * @since 0.0.1
-        * @param DOMNode $node The top node to get text from.
-        * @return string The cleaned text from the nodes.
+        * @param string $markedUpText The string to extract tags from.
+        * @return array An array containing the found tags.
         */
 
-       private static function getTextContent( $node ) {
-               $content = '';
-               if ( !self::matchesRemove( $node ) ) {
-                       foreach ( $node->childNodes as $child ) {
-                               if ( $child->nodeType == XML_TEXT_NODE ) {
-                                       $content .= $child->textContent;
-                               } else {
-                                       $content .= self::getTextContent( 
$child );
+       private static function getTags( $markedUpText ) {
+               $potentialTagBrackets = [];
+               preg_match_all(
+                       '/[<>]/',
+                       $markedUpText,
+                       $potentialTagBrackets,
+                       PREG_SET_ORDER | PREG_OFFSET_CAPTURE
+               );
+               $tags = [];
+               $startBracket = null;
+               foreach ( $potentialTagBrackets as $match ) {
+                       // $match[0] is an array containing the matched string 
and its
+                       // position.
+                       $bracketString = $match[0][0];
+                       if ( $bracketString == '<' ) {
+                               if ( $startBracket == null ) {
+                                       $startBracket = $match[0];
+                               }
+                       } elseif ( $bracketString == '>' ) {
+                               $tagString = substr(
+                                       $markedUpText,
+                                       $startBracket[1],
+                                       $match[0][1] - $startBracket[1] + 1
+                               );
+                               $bracketPosition = $startBracket[1];
+                               $startBracket = null;
+                               if ( self::isStartTag( $tagString ) ) {
+                                       array_push( $tags, [ [ $tagString, 
$bracketPosition ] ] );
+                               } elseif ( self::isEndTag( $tagString ) ) {
+                                       $startTagIndex = 
self::getCorrespondingStartTagIndex(
+                                               $tags,
+                                               $tagString
+                                       );
+                                       // Add the end tag to the array already 
containing the
+                                       // start tag.
+                                       array_push(
+                                               $tags[$startTagIndex],
+                                               [ $tagString, $bracketPosition ]
+                                       );
+                               } elseif ( self::isEmptyTag( $tagString ) ) {
+                                       array_push( $tags, $tagString );
                                }
                        }
                }
-               return $content;
+               return $tags;
        }
 
        /**
-        * Check if a tag matches criteria for removal.
-        *
-        * The criteria are defined by $wgWikispeechRemoveTags, which is a map
-        * where the keys are tag names. If the value is true, the tag will be
-        * removed. If the value is an array, it defines further criteria,
-        * currently only class name, which needs to match for the tag to be
-        * removed.
-        *
-        * The value may be false, which means the tag won't be removed. This 
is to
-        * allow overriding default values in LocalSettings.php, but is 
otherwise
-        * not required.
+        * Test if a string is a start tag.
         *
         * @since 0.0.1
-        * @param DOMNode $node The node for the tag to check.
-        * @return bool true if the tag match removal criteria, otherwise false.
+        * @param $tagString The string to test.
+        * @return true if $tagString is a start tag, else false.
+        */
+
+       private static function isStartTag( $tagString ) {
+               return !preg_match( '!^</!', $tagString ) &&
+                       !preg_match( '!/>$!', $tagString );
+       }
+
+       /**
+        * Test if a string is an end tag.
+        *
+        * @since 0.0.1
+        * @param $tagString The string to test.
+        * @return true if $tagString is an end tag, else false.
+        */
+
+       private static function isEndTag( $tagString ) {
+               return preg_match( '!^</!', $tagString );
+       }
+
+       /**
+        * Test if a string is an empty tag.
+        *
+        * @since 0.0.1
+        * @param $tagString The string to test.
+        * @return true if $tagString is an empty tag, else false.
+        */
+
+       private static function isEmptyTag( $tagString ) {
+               return preg_match( '!/>$!', $tagString );
+       }
+
+       /**
+        * Get the index in $tags of the tag that starts the element which ends
+        * with $tagString.
+        *
+        * Traverses $tags backwards and tests if start tags are of the same 
type
+        * as the one in $tagString.
+        *
+        * @since 0.0.1
+        * @param array $tags Tag array, as returned from getTags().
+        * @param string $tagString the end tag to find start tag for, as HTML
+        *  string.
+        * @return int The index in $tags of the start tag found.
+        */
+
+       private static function getCorrespondingStartTagIndex( $tags, 
$tagString ) {
+               for ( $i = count( $tags ) - 1; $i >= 0; $i -- ) {
+                       $tag = $tags[$i];
+                       // Make sure the tag to test is an array, i.e. a start 
tag.
+                       if ( is_array( $tag ) ) {
+                               $startTagType = self::getTagName( $tag[0][0] );
+                               $endTagType = self::getTagName( $tagString );
+                               if ( $startTagType == $endTagType ) {
+                                       return $i;
+                               }
+                       }
+               }
+       }
+
+       /**
+        * Get the tag name from a tag string.
+        *
+        * @since 0.0.1
+        * @param string $tagString The tag as string.
+        * @return string The name of the tag in $tagString.
+        */
+
+       private static function getTagName( $tagString ) {
+               $nameMatch = null;
+               preg_match( '!</?([^ />]+)( />)?!', $tagString, $nameMatch );
+               $tagName = $nameMatch[1];
+               return $tagName;
+       }
+
+       /**
+        * Recursively add content as either CleanedTags or strings.
+        *
+        * Goes through all the child nodes of $node and add the corresponding
+        * content. If a child is a tag, it's added as a CleanedTag of the
+        * appropriate type (Start, End or Empty). If a child is a text node, 
the
+        * text is added as a string.
+        *
+        * @since 0.0.1
+        * @param array $content The resulting array of CleanedTags and strings.
+        * @param array $tags Tag array, as generated by getTags().
+        * @param DOMNode $node The top node to add from.
+        * @param string $source The HTML string that DOM is generated from. 
Used
+        *  for retrieving element contents.
+        * @param int $tagIndex The index of the next tag, from $tags.
+        * @param bool $parentRemoved Whether the parent node of $node was 
removed.
+        *  This is used to prevent content under removed nodes to be added.
+        */
+
+       private static function addContent(
+               &$content,
+               $node,
+               $source,
+               $tags,
+               &$tagIndex,
+               $parentRemoved=false
+       ) {
+               // Save the current tag index to find the correct end tag, since
+               // $tagIndex will change if there are child nodes.
+               $thisTagIndex = $tagIndex;
+               // Don't add the dummy tag.
+               if ( $thisTagIndex >= 0 ) {
+                       $tag = $tags[$thisTagIndex];
+                       if ( is_array( $tag ) ) {
+                               $tagString = $tag[0][0];
+                               $cleanedStartTag = new CleanedStartTag( 
$tagString );
+                               if ( self::matchesRemove( $node ) ) {
+                                       $cleanedStartTag->removed = true;
+                               }
+                               if ( !$parentRemoved ) {
+                                       array_push( $content, $cleanedStartTag 
);
+                               }
+                               $elementContentStartPosition =
+                                       $tag[0][1] + strlen( $tagString );
+                       } else {
+                               // If the tag is empty, just add it and return, 
since there
+                               // can't be any child nodes.
+                               $cleanedTag = new CleanedEmptyTag( $tag );
+                               if ( !$parentRemoved ) {
+                                       array_push( $content, $cleanedTag );
+                               }
+                               return;
+                       }
+               }
+               foreach ( $node->childNodes as $child ) {
+                       if ( $child->nodeType == XML_TEXT_NODE ) {
+                               if ( !self::matchesRemove( $node ) && 
!$parentRemoved ) {
+                                       // Only keep text nodes if they aren't 
children of a
+                                       // removed tag.
+                                       array_push( $content, 
$child->textContent );
+                               }
+                       } else {
+                               // Nodes are handled even if their parents are 
removed, to not
+                               // get the DOM nodes out of sync with $tags.
+                               $tagIndex += 1;
+                               self::addContent(
+                                       $content,
+                                       $child,
+                                       $source,
+                                       $tags,
+                                       $tagIndex,
+                                       self::matchesRemove( $node )
+                               );
+                       }
+               }
+               if (
+                       $thisTagIndex >= 0 &&
+                       is_array( $tags[$thisTagIndex] ) &&
+                       !$parentRemoved
+               ) {
+                       $endTag = $tags[$thisTagIndex][1];
+                       $tagString = $endTag[0];
+                       $length = $endTag[1] - $elementContentStartPosition;
+                       // The element content is the string between the end of 
the
+                       // start tag and the start of the end tag.
+                       $elementContentString =
+                               substr( $source, $elementContentStartPosition, 
$length );
+                       $cleanedStartTag->contentLength = strlen( 
$elementContentString );
+                       array_push( $content, new CleanedEndTag( $tagString ) );
+               }
+       }
+
+       /**
+        * Check if a node matches criteria for removal.
+        *
+        * The node is compared to the removal criteria from the configuration, 
to
+        * determine if it should be removed completely.
+        *
+        * @since 0.0.1
+        * @param DOMNode $node The node to check.
+        * @return bool true if the node match removal criteria, otherwise 
false.
         */
 
        private static function matchesRemove( $node ) {
@@ -80,12 +311,12 @@
                        // The node name isn't found in the removal list.
                        return false;
                }
-               $removeCriteria = $wgWikispeechRemoveTags[ $node->nodeName ];
+               $removeCriteria = $wgWikispeechRemoveTags[$node->nodeName];
                if ( $removeCriteria === true ) {
                        // Node name is found and there are no extra criteria.
                        return true;
                }
-               if ( self::nodeHasClass( $node, $removeCriteria[ 'class' ] ) ) {
+               if ( self::nodeHasClass( $node, $removeCriteria['class'] ) ) {
                        // Node name and class name match.
                        return true;
                }
diff --git a/includes/HtmlGenerator.php b/includes/HtmlGenerator.php
index 6d94604..7f77e30 100644
--- a/includes/HtmlGenerator.php
+++ b/includes/HtmlGenerator.php
@@ -6,35 +6,33 @@
  * @license GPL-2.0+
  */
 
+require_once 'CleanedTag.php';
+
 class HtmlGenerator {
 
        /**
-        * Generate an HTML string for a sequence of utternaces. Utterance tags
-        * look like this:
-        * <utterance id="utterance-0><text>Utterance 
string.</text><audio></audio></utterance>
-        * The <text> and <audio> tags are used to request audio from the TTS
-        * server and store the response.
+        * Create an HTML string for a sequence of utterances.
         *
         * @since 0.0.1
-        * @param array $utterances The utterance strings to generate HTML from.
+        * @param array $segments Array of segments to generate utterances from.
         * @return string An HTML string containing the <utterance> tags, 
wrapped
         *      in an <utterances> tag.
         */
 
-       public static function generateUtterancesHtml( $utterances ) {
-               if ( count( $utterances ) ) {
+       public static function createUtterancesHtml( $segments ) {
+               if ( count( $segments ) ) {
                        $dom = new DOMDocument();
                        $utterancesNode = $dom->createElement( 'utterances' );
                        // Hide the content of the utterance elements.
                        $utterancesNode->setAttribute( 'hidden', '' );
                        $index = 0;
-                       foreach ( $utterances as $utteranceString ) {
-                               $utteranceNode = self::generateUtteranceElement(
+                       foreach ( $segments as $segment ) {
+                               $utteranceElement = 
self::createUtteranceElement(
                                        $dom,
-                                       $utteranceString,
+                                       $segment,
                                        $index
                                );
-                               $utterancesNode->appendChild( $utteranceNode );
+                               $utterancesNode->appendChild( $utteranceElement 
);
                                $index += 1;
                        }
                        $utternacesHtml = urldecode( $dom->saveHTML( 
$utterancesNode ) );
@@ -42,35 +40,76 @@
                }
        }
 
+       // @codingStandardsIgnoreStart
        /**
-        * Create an utterance element, which has child elements for the 
utterance
-        * string and audio.
+        * Create an utterance element.
+        *
+        * The element looks like this in HTML:
+        * <utterance id="utterance-0><nodes>Utterance string with 
<cleaned-tag>not removed tag</cleaned-tag>.</nodes></utterance>
+        *
+        * The id is a zero based index, used to find the adjacent utterances, 
when
+        * next or previous utterance should be played.
+        *
+        * The nodes element contains a representation of the HTML that were 
used
+        * to generate this utterance. Text nodes are the same as in the 
original
+        * HTML. Elements are represented by cleaned-tag elements, whose
+        * contents are the tags from the original HTML, excluding < and >.
         *
         * @since 0.0.1
         * @param DOMDocument $dom The DOMDocument to use for creating the
-        *      elements.
-        * @param string $utteranceString The string to add to the text element,
-        *      which is later sent to the TTS server.
+        *      element.
+        * @param array $segment An array with position and content as an array 
of
+        *      CleanedTags and strings.
         * @param int $index The index of the element, used for giving it an id.
         *      Later used for playing the utterances in the correct order.
         * @return DOMElement The resulting utterance element.
         */
+       // @codingStandardsIgnoreEnd
 
-       private static function generateUtteranceElement(
-               $dom,
-               $utteranceString,
-               $index
-       ) {
+       private static function createUtteranceElement( $dom, $segment, $index 
) {
                $utteranceElement = $dom->createElement( 'utterance' );
                $utteranceElement->setAttribute( 'id', "utterance-$index" );
+               $utteranceElement->setAttribute(
+                       'position',
+                       $segment['position']
+               );
+               $content = self::getSegmentContentHtml( $segment['content'] );
                $textNode = $dom->createElement(
-                       'text',
-                       // URL encoding (and later decoding) if required due to
+                       'nodes',
+                       // URL encoding (and later decoding) is required due to
                        // strings containing # not being written otherwise.
-                       urlencode( $utteranceString ) );
+                       urlencode( $content ) );
                $utteranceElement->appendChild( $textNode );
-               $audioNode = $dom->createElement( 'audio' );
-               $utteranceElement->appendChild( $audioNode );
                return $utteranceElement;
        }
+
+       /**
+        * Get the content of a segment, as HTML.
+        *
+        * CleanedTags are represented as cleaned-tag elements.
+        *
+        * @since 0.0.1
+        * @param array $nodes An array of CleanedTags and strings.
+        * @return string An HTML representation of $nodes.
+        */
+
+       private static function getSegmentContentHtml( $nodes ) {
+               $content = '';
+               foreach ( $nodes as $node ) {
+                       if ( $node instanceof CleanedTag ) {
+                               // Remove the < and > from the tag string to 
not have to
+                               // decode them later.
+                               $text = substr( $node->tagString, 1, -1 );
+                               $dom = new DOMDocument();
+                               $tagNode = $dom->createElement( 'cleaned-tag', 
$text );
+                               if ( $node instanceof CleanedStartTag && 
$node->removed ) {
+                                       $tagNode->setAttribute( 'removed', 
$node->contentLength );
+                               }
+                               $content .= $dom->saveXML( $tagNode );
+                       } else {
+                               $content .= $node;
+                       }
+               }
+               return $content;
+       }
 }
diff --git a/includes/Segmenter.php b/includes/Segmenter.php
index f9af191..550294e 100644
--- a/includes/Segmenter.php
+++ b/includes/Segmenter.php
@@ -6,49 +6,112 @@
  * @license GPL-2.0+
  */
 
+require_once 'CleanedTag.php';
+
 class Segmenter {
 
        /**
-        * Divide a string into segments, where each segment is a sentence. A
-        * sentence is here defined as a number of tokens ending with a dot 
(full
-        * stop) or a newline. Headings are also considered sentences.
+        * Divide a cleaned content array into segments, one for each sentence.
+        *
+        * A segment is an array with the keys "content" and "position". 
Content is
+        * an array of CleanedTags and strings. Position is the start
+        * position, in the HTML, for the first node in content, i.e. the start
+        * position of the segment.
+        *
+        * A sentence is here defined as a number of tokens ending with a dot 
(full
+        * stop). Headings are also considered sentences.
         *
         * @since 0.0.1
-        * @param string $text A string to segment.
-        * @return array The segments found.
+        * @param array $cleanedContent An array of cleaned content, as 
returned by
+        *  Cleaner::cleanHtml().
+        * @return array An array of segments, each containing the nodes in that
+        *  segment and the start position in the HTML.
         */
 
-       public static function segmentSentences( $text ) {
-               $matches = [];
-               // Find the indices of all characters that may be sentence 
final.
-               preg_match_all(
-                       "/(.|\n)/",
-                       $text,
-                       $matches,
-                       PREG_OFFSET_CAPTURE );
-               $start = 0;
+       public static function segmentSentences( $cleanedContent ) {
                $segments = [];
-               foreach ( $matches[ 0 ] as $match ) {
-                       $index = $match[ 1 ];
-                       if ( self::isSentenceFinal( $text, $index ) ) {
-                               $length = $index - $start + 1;
-                               $segment = trim( substr( $text, $start, $length 
) );
-                               if ( $segment != '' ) {
-                                       // Strings that are only whitespaces 
are not considered
-                                       // sentences.
-                                       array_push( $segments, $segment );
-                                       // Start the next sentence after the 
sentence final
-                                       // character.
-                                       $start = $index + 1;
-                               }
+               $currentSegment = [
+                       'position' => 0,
+                       'content' => []
+               ];
+               foreach ( $cleanedContent as $content ) {
+                       if ( $content instanceof CleanedTag ) {
+                               // Non-text nodes are always added to the 
current segment, as
+                               // they can't contain segment breaks.
+                               array_push( $currentSegment['content'], 
$content );
+                       } else {
+                               self::addSegments(
+                                       $segments,
+                                       $currentSegment,
+                                       $content
+                               );
                        }
+               }
+               if ( $currentSegment['content'] ) {
+                       // Add the last segment, unless it's empty.
+                       array_push( $segments, $currentSegment );
                }
                return $segments;
        }
 
        /**
-        * Tests if a character is at the end of a sentence. Dots in 
abbreviations
-        * should only be counted when they also are sentence final. For 
example:
+        * Add segments for a string.
+        *
+        * Finds segments, or parts thereof, by sentence final strings and adds
+        * them as segments. The first string segment is added to the current
+        * segment, if any. Subsequent string segments are added as is.
+        *
+        * @since 0.0.1
+        * @param array $segments The segments array to add new segments to.
+        * @param array $currentSegment The segment under construction, to which
+        *  the first found string segment will be added.
+        * @param string $text The string to segment.
+        */
+
+       private static function addSegments(
+               &$segments,
+               &$currentSegment,
+               $text
+       ) {
+               // Find the indices of all characters that may be sentence 
final.
+               preg_match_all(
+                       "/\./",
+                       $text,
+                       $matches,
+                       PREG_OFFSET_CAPTURE
+               );
+               $position = 0;
+               foreach ( $matches[0] as $match ) {
+                       $sentenceFinalPosition = $match[1];
+                       if ( self::isSentenceFinal( $text, 
$sentenceFinalPosition ) ) {
+                               $length = $sentenceFinalPosition - $position + 
1;
+                               $segmentText = substr( $text, $position, 
$length );
+                               if ( trim( $segmentText ) != '' ) {
+                                       // Don't add segments with only 
whitespaces.
+                                       array_push( $currentSegment['content'], 
$segmentText );
+                                       $position = $sentenceFinalPosition + 1;
+                                       array_push( $segments, $currentSegment 
);
+                                       $nextSegmentPosition =
+                                               self::getSegmentLength( 
$currentSegment['content'] );
+                                       $currentSegment = [
+                                               'position' => 
$nextSegmentPosition,
+                                               'content' => []
+                                       ];
+                               }
+                       }
+               }
+               $remainder = substr( $text, $position );
+               if ( $remainder ) {
+                       // Add any remaining part of the string.
+                       array_push( $currentSegment['content'], $remainder );
+               }
+       }
+
+       /**
+        * Test if a character is at the end of a sentence.
+        *
+        * Dots in abbreviations should only be counted when they also are 
sentence
+        * final. For example:
         * "Monkeys, penguins etc.", but not "Monkeys e.g. baboons".
         *
         * @since 0.0.1
@@ -58,26 +121,23 @@
         */
 
        private static function isSentenceFinal( $string, $index ) {
-               $character = $string[ $index ];
+               $character = $string[$index];
                $nextCharacter = null;
                if ( strlen( $string ) > $index + 1 ) {
-                       $nextCharacter = $string[ $index + 1 ];
+                       $nextCharacter = $string[$index + 1];
                }
                $characterAfterNext = null;
                if ( strlen( $string ) > $index + 2 ) {
-                       $characterAfterNext = $string[ $index + 2 ];
+                       $characterAfterNext = $string[$index + 2];
                }
-               if ( $character == "\n" ) {
-                       // A newline is always sentence final.
-                       return true;
-               } elseif (
+               if (
                        $character == '.' &&
-                       $nextCharacter == ' ' && self::isUpper( 
$characterAfterNext ) ||
-                       $nextCharacter == "\n" ||
-                       $nextCharacter == ''
+                       ( $nextCharacter == ' ' && self::isUpper( 
$characterAfterNext ) ||
+                       $nextCharacter == '' ||
+                       $nextCharacter == "\n" )
                ) {
                        // A dot is sentence final if it's followed by a space 
and a
-                       // capital letter, at the end of line or at the end of 
string.
+                       // capital letter or at the end of string or line.
                        return true;
                } else {
                        return false;
@@ -85,11 +145,11 @@
        }
 
        /**
-        * Tests if a string is upper case.
+        * Test if a string is upper case.
         *
         * @since 0.0.1
         * @param string $string The string to test.
-        * @return bool True if the entire string is upper case, else false.
+        * @return bool true if the entire string is upper case, else false.
         */
 
        private static function isUpper( $string ) {
@@ -97,21 +157,22 @@
        }
 
        /**
-        * Split a string by newline.
+        * Calculate the length of a segment, as it is represented in HTML.
         *
         * @since 0.0.1
-        * @param string $text A string to segment.
-        * @return array The segments found. Segments only containing 
whitespaces
-        * are discarded.
+        * @param array $segment An array of nodes.
+        * @return int The combined length of the HTML of the nodes in 
$segment.
         */
 
-       public static function segmentParagraphs( $text ) {
-               $segments = [];
-               foreach ( explode( "\n", $text ) as $segment ) {
-                       if ( strlen( trim( $segment ) ) > 0 ) {
-                               array_push( $segments, $segment );
+       private static function getSegmentLength( $segment ) {
+               $length = 0;
+               foreach ( $segment as $content ) {
+                       if ( $content instanceof CleanedTag ) {
+                               $length += $content->getLength();
+                       } else {
+                               $length += strlen( $content );
                        }
                }
-               return $segments;
+               return $length;
        }
 }
diff --git a/modules/ext.wikispeech.js b/modules/ext.wikispeech.js
index a416bf5..4872ae4 100644
--- a/modules/ext.wikispeech.js
+++ b/modules/ext.wikispeech.js
@@ -14,9 +14,7 @@
                 */
 
                this.addPlayStopButton = function () {
-                       var $playStopButton;
-
-                       $playStopButton = $( '<button></button>' )
+                       var $playStopButton = $( '<button></button>' )
                                .attr( 'id', 'ext-wikispeech-play-stop-button' )
                                .addClass( 'ext-wikispeech-play' );
                        $( '#firstHeading' ).append( $playStopButton );
@@ -42,10 +40,8 @@
                 */
 
                this.play = function () {
-                       var $playStopButton;
-
+                       var $playStopButton = $( 
'#ext-wikispeech-play-stop-button' );
                        self.playUtterance( $( '#utterance-0' ) );
-                       $playStopButton = $( '#ext-wikispeech-play-stop-button' 
);
                        $playStopButton.removeClass( 'ext-wikispeech-play' );
                        $playStopButton.addClass( 'ext-wikispeech-stop' );
                };
@@ -81,11 +77,9 @@
                 */
 
                this.stop = function () {
-                       var $playStopButton;
-
+                       var $playStopButton = $( 
'#ext-wikispeech-play-stop-button' );
                        self.stopUtterance( $currentUtterance );
                        $currentUtterance = $();
-                       $playStopButton = $( '#ext-wikispeech-play-stop-button' 
);
                        $playStopButton.removeClass( 'ext-wikispeech-stop' );
                        $playStopButton.addClass( 'ext-wikispeech-play' );
                };
@@ -98,9 +92,7 @@
                 */
 
                this.addSkipAheadSentenceButton = function () {
-                       var $skipAheadSentenceButton;
-
-                       $skipAheadSentenceButton = $( '<button></button>' )
+                       var $skipAheadSentenceButton = $( '<button></button>' )
                                .attr( 'id', 
'ext-wikispeech-skip-ahead-sentence-button' )
                                .addClass( 'ext-wikispeech-skip-ahead-sentence' 
);
                        $( '#firstHeading' ).append( $skipAheadSentenceButton );
@@ -116,9 +108,7 @@
                 */
 
                this.skipAheadUtterance = function () {
-                       var $nextUtterance;
-
-                       $nextUtterance = self.getNextUtterance( 
$currentUtterance );
+                       var $nextUtterance = self.getNextUtterance( 
$currentUtterance );
                        if ( $nextUtterance.length ) {
                                self.playUtterance( $nextUtterance );
                        } else {
@@ -131,14 +121,11 @@
                 */
 
                this.addKeyboardShortcuts = function () {
-                       var shortcuts;
-
+                       var shortcuts = mw.config.get( 
'wgWikispeechKeyboardShortcuts' );
                        $( document ).keydown( function ( event ) {
-                               shortcuts = mw.config.get( 
'wgWikispeechKeyboardShortcuts' );
                                if ( self.eventMatchShortcut( event, 
shortcuts.playStop ) ) {
                                        self.playOrStop();
-                               }
-                               if ( self.eventMatchShortcut(
+                               } else if ( self.eventMatchShortcut(
                                        event,
                                        shortcuts.skipAheadUtterance )
                                ) {
@@ -212,7 +199,7 @@
                 *
                 * @param $utterance The original utterance.
                 * @return The utterance after the original utterance. Empty 
object if
-                *      $utterance isn't a valid utterance.
+                *  $utterance isn't a valid utterance.
                 */
 
                this.getNextUtterance = function ( $utterance ) {
@@ -233,8 +220,7 @@
                /**
                 * Request audio for an utterance.
                 *
-                * When the response is received, set the audio URL as the 
source for
-                * the utterance's audio element.
+                * Adds audio and token elements when the response is received.
                 *
                 * @param $utterance The utterance to load audio for.
                 */
@@ -242,14 +228,21 @@
                this.loadAudio = function ( $utterance ) {
                        var $audio, text, audioUrl;
 
-                       $audio = $utterance.children( 'audio' );
+                       $audio = $( '<audio></audio>' ).appendTo( $utterance );
                        mw.log( 'Loading audio for: ' + $utterance.attr( 'id' ) 
);
-                       text = $utterance.children( 'text' ).text();
+                       // Get the combined string of the text nodes only, i.e. 
not from
+                       // the cleaned tag.
+                       text = $utterance.children( 'nodes' ).contents().filter(
+                               function () {
+                                       return this.nodeType === 3;
+                               }
+                       ).text();
                        self.requestTts( text, function ( response ) {
                                audioUrl = response.audio;
                                mw.log( 'Setting url for ' + $utterance.attr( 
'id' ) + ': ' +
                                                audioUrl );
                                $audio.attr( 'src', audioUrl );
+                               self.addTokenElements( $utterance, 
response.tokens );
                        } );
                        $utterance.prop( 'requested', true );
                };
@@ -267,7 +260,7 @@
                 *
                 * @param {string} text The utterance string to send in the 
request.
                 * @param {Function} callback Function to be called when a 
response
-                *      is received.
+                *  is received.
                 */
 
                this.requestTts = function ( text, callback ) {
@@ -289,12 +282,117 @@
                                // jscs:enable 
requireCamelCaseOrUpperCaseIdentifiers
                        } );
                        request.onload = function () {
+                               mw.log( 'Response received: ' + 
request.responseText );
                                response = JSON.parse( request.responseText );
                                callback( response );
                        };
                        mw.log( 'Sending request: ' + serverUrl + '?' + 
parameters );
                        request.send( parameters );
                };
+
+               /**
+                * Add token elements to an utterance element.
+                *
+                * Adds a tokens element and populates it with token elements.
+                *
+                * @param $utterance The jQuery object to add tokens to.
+                * @param tokens Array of tokens from a server response, where 
each
+                *  token is an object. For these objects, the property "orth" 
is the
+                *  string used by the TTS to generate audio for the token.
+                */
+
+               this.addTokenElements = function ( $utterance, tokens ) {
+                       var position, $tokens, $nodes, firstTokenIndex, 
removedLength;
+
+                       // The character position in the original HTML. 
Starting at the
+                       // position of the utterance, since that's the earliest 
a child
+                       // token can appear.
+                       position = parseInt( $utterance.attr( 'position' ), 10 
);
+                       $tokens = $( '<tokens></tokens>' ).appendTo( $utterance 
);
+                       $nodes = $utterance.children( 'nodes' );
+                       firstTokenIndex = 0;
+                       mw.log( 'Adding tokens to ' + $utterance.attr( 'id' ) + 
':' );
+                       $nodes.contents().each( function ( i, element ) {
+                               if ( element.tagName === 'CLEANED-TAG' ) {
+                                       removedLength = element.getAttribute( 
'removed' );
+                                       if ( removedLength !== null ) {
+                                               position += parseInt( 
removedLength, 10 );
+                                       }
+                                       // Advance position two steps extra for 
the < and >,
+                                       // that were stripped from the tag at 
an earlier stage.
+                                       position += 2;
+                               } else {
+                                       // firstTokenIndex is the index, in 
tokens, of the first
+                                       // token we haven't created an element 
for.
+                                       firstTokenIndex = 
self.addTokensForTextElement(
+                                               tokens,
+                                               element,
+                                               position,
+                                               $tokens,
+                                               firstTokenIndex
+                                       );
+                               }
+                               position += element.textContent.length;
+                       } );
+               };
+
+               /**
+                * Add a token element for each token that matches a substring of 
the
+                * given text element.
+                *
+                * Goes through textElement, finds substrings matching tokens 
and
+                * creates token elements for these. The position for the token
+                * elements is the substring position plus the position of 
textElement.
+                * When a token can no longer be found, the index of that token 
is
+                * returned to remember what to start looking for in the next 
text
+                * element.
+                *
+                * @param tokens Array of tokens from a server response, where 
each
+                *  token is an object. For these objects, the property "orth" 
is the
+                *  string used by the TTS to generate audio for the token.
+                * @param textElement The text element to match tokens against.
+                * @param {int} startPosition The position of the original text
+                *  element.
+                * @param $tokens Element which token elements are added to.
+                * @param {int} firstTokenIndex The index of the first token in 
tokens
+                *  to search for.
+                * @return {int} The index of the first token that wasn't found.
+                */
+
+               this.addTokensForTextElement = function (
+                       tokens,
+                       textElement,
+                       startPosition,
+                       $tokens,
+                       firstTokenIndex
+               ) {
+                       var positionInElement, matchingPosition, 
tokenPositionInHtml,
+                               orthographicToken, i, token;
+
+                       positionInElement = 0;
+                       for ( i = firstTokenIndex; i < tokens.length; i++ ) {
+                               token = tokens[ i ];
+                               orthographicToken = token.orth;
+                               // Look for the token in the remaining string.
+                               matchingPosition =
+                                       textElement.nodeValue.slice( 
positionInElement )
+                                       .indexOf( orthographicToken );
+                               if ( matchingPosition === -1 ) {
+                                       // The token wasn't found in this 
element. Stop looking for
+                                       // more and return the index of the 
token.
+                                       return i;
+                               }
+                               tokenPositionInHtml = startPosition + 
positionInElement +
+                                       matchingPosition;
+                               mw.log( '  "' + orthographicToken + '", 
position: ' +
+                                               tokenPositionInHtml );
+                               $( '<token></token>' )
+                                       .text( orthographicToken )
+                                       .attr( 'position', tokenPositionInHtml )
+                                       .appendTo( $tokens );
+                               positionInElement += orthographicToken.length;
+                       }
+               };
        }
 
        mw.wikispeech = {};
diff --git a/tests/phpunit/CleanerTest.php b/tests/phpunit/CleanerTest.php
index 1272949..6493f3e 100644
--- a/tests/phpunit/CleanerTest.php
+++ b/tests/phpunit/CleanerTest.php
@@ -7,6 +7,7 @@
  */
 
 require_once __DIR__ . '/../../includes/Cleaner.php';
+require_once 'Util.php';
 
 class CleanerTest extends MediaWikiTestCase {
        protected function setUp() {
@@ -16,14 +17,19 @@
                        'table' => true,
                        'sup' => [ 'class' => 'reference' ],
                        'editsection' => true,
-                       'h2' => false
+                       'h2' => false,
+                       'del' => true
                ];
        }
 
        public function testCleanTags() {
                $markedUpText = '<i>Blonde on Blonde</i>';
-               $expectedText = 'Blonde on Blonde';
-               $this->assertTextCleaned( $expectedText, $markedUpText );
+               $expectedCleanedContent = [
+                       Util::createStartTag( '<i>', 'Blonde on Blonde' ),
+                       'Blonde on Blonde',
+                       new CleanedEndTag( '</i>' )
+               ];
+               $this->assertTextCleaned( $expectedCleanedContent, 
$markedUpText );
        }
 
        /**
@@ -35,89 +41,220 @@
         * should not be altered.
         *
         * @since 0.0.1
-        * @param string $expectedText The string that is the expected output
-        * from the function named by $function.
+        * @param array $expectedCleanedContent The content that is the expected
+        *  output.
         * @param string $markedUpText The string that contains the markup
-        * that should be cleaned. Used as input to the function named by
-        * $function.
+        *  that should be cleaned
         */
 
-       private function assertTextCleaned( $expectedText, $markedUpText ) {
+       private function assertTextCleaned(
+               $expectedCleanedContent,
+               $markedUpText
+       ) {
                $this->assertEquals(
-                       $expectedText,
+                       $expectedCleanedContent,
                        Cleaner::cleanHtml( $markedUpText )
                );
-               $this->assertEquals( 'prefix' . $expectedText . 'suffix',
-                       Cleaner::cleanHtml( 'prefix' . $markedUpText . 'suffix' 
) );
-               $this->assertEquals( $expectedText . 'infix' . $expectedText,
-                       Cleaner::cleanHtml( $markedUpText . 'infix' . 
$markedUpText ) );
-               $this->assertEquals( 'A string without any fancy markup.',
-                       Cleaner::cleanHtml( 'A string without any fancy 
markup.' ) );
+               $this->assertEquals(
+                       array_merge( [ 'prefix' ], $expectedCleanedContent, [ 
'suffix' ] ),
+                       Cleaner::cleanHtml( 'prefix' . $markedUpText . 'suffix' 
)
+               );
+               $this->assertEquals(
+                       array_merge( $expectedCleanedContent, [ 'infix' ], 
$expectedCleanedContent ),
+                       Cleaner::cleanHtml( $markedUpText . 'infix' . 
$markedUpText )
+               );
+               $this->assertEquals(
+                       [ 'A string without any fancy markup.' ],
+                       Cleaner::cleanHtml( 'A string without any fancy 
markup.' )
+               );
        }
 
        public function testCleanNestedTags() {
                $markedUpText = '<i><b>Blonde on Blonde</b></i>';
-               $expectedText = 'Blonde on Blonde';
-               $this->assertTextCleaned( $expectedText, $markedUpText );
+               $expectedCleanedContent = [
+                       Util::createStartTag( '<i>', '<b>Blonde on Blonde</b>' 
),
+                       Util::createStartTag( '<b>', 'Blonde on Blonde' ),
+                       'Blonde on Blonde',
+                       new CleanedEndTag( '</b>' ),
+                       new CleanedEndTag( '</i>' )
+               ];
+               $this->assertTextCleaned( $expectedCleanedContent, 
$markedUpText );
        }
 
        public function testCleanEmptyTags() {
                $markedUpText = '<img alt="" src="image.png" />';
-               $expectedText = '';
-               $this->assertTextCleaned( $expectedText, $markedUpText );
+               $expectedCleanedContent = [
+                       new CleanedEmptyTag( '<img alt="" src="image.png" />' )
+               ];
+               $this->assertTextCleaned( $expectedCleanedContent, 
$markedUpText );
        }
 
-       public function testRemoveTagsAltogether() {
+       public function testRemoveTagsCompletely() {
                $markedUpText = '<table>Remove this table, please.</table>';
-               $expectedText = '';
-               $this->assertTextCleaned( $expectedText, $markedUpText );
+               $expectedCleanedContent = [
+                       Util::createStartTag(
+                               '<table>',
+                               'Remove this table, please.',
+                               true
+                       ),
+                       new CleanedEndTag( '</table>' )
+               ];
+               $this->assertTextCleaned( $expectedCleanedContent, 
$markedUpText );
        }
 
        public function testRemoveTagsWithCertainClass() {
-               $markedUpText = '<sup class="reference"><a>[1]</a>Also remove 
this.</sup>';
-               $expectedText = '';
-               $this->assertTextCleaned( $expectedText, $markedUpText );
+               $markedUpText = '<sup class="reference">Remove this.</sup>';
+               $expectedCleanedContent = [
+                       Util::createStartTag(
+                               '<sup class="reference">',
+                               'Remove this.',
+                               true
+                       ),
+                       new CleanedEndTag( '</sup>' )
+               ];
+               $this->assertTextCleaned( $expectedCleanedContent, 
$markedUpText );
        }
 
        public function testDontRemoveTagsWhichCriteriaAreFalse() {
                $markedUpText = '<h2>Contents</h2>';
-               $expectedText = 'Contents';
-               $this->assertTextCleaned( $expectedText, $markedUpText );
+               $expectedCleanedContent = [
+                       Util::createStartTag( '<h2>', 'Contents' ),
+                       'Contents',
+                       new CleanedEndTag( '</h2>' )
+               ];
+               $this->assertTextCleaned( $expectedCleanedContent, 
$markedUpText );
        }
 
        public function testDontRemoveTagsWithoutCertainClass() {
                // @codingStandardsIgnoreStart
                $markedUpText = '<sup>I am not a reference.</sup><sup 
class="not-a-reference">Neither am I.</sup>';
                // @codingStandardsIgnoreEnd
-               $expectedText = 'I am not a reference.Neither am I.';
-               $this->assertTextCleaned( $expectedText, $markedUpText );
+               $expectedCleanedContent = [
+                       Util::createStartTag( '<sup>', 'I am not a reference.' 
),
+                       'I am not a reference.',
+                       new CleanedEndTag( '</sup>' ),
+                       Util::createStartTag(
+                               '<sup class="not-a-reference">',
+                               'Neither am I.'
+                       ),
+                       'Neither am I.',
+                       new CleanedEndTag( '</sup>' )
+               ];
+               $this->assertTextCleaned( $expectedCleanedContent, 
$markedUpText );
        }
 
        public function testHandleMultipleClasses() {
                // @codingStandardsIgnoreStart
-               $markedUpText = '<sup class="reference another-class"><a 
href="#cite_note-Grayp5-1">[1]</a>Also remove this.</sup>';
+               $markedUpText = '<sup class="reference another-class">Remove 
this.</sup>';
                // @codingStandardsIgnoreEnd
-               $expectedText = '';
-               $this->assertTextCleaned( $expectedText, $markedUpText );
+               $expectedCleanedContent = [
+                       Util::createStartTag(
+                               '<sup class="reference another-class">',
+                               'Remove this.',
+                               true
+                       ),
+                       new CleanedEndTag( '</sup>' )
+               ];
+               $this->assertTextCleaned( $expectedCleanedContent, 
$markedUpText );
        }
 
        public function testCleanNestedTagsWhereSomeAreRemovedAndSomeAreKept() {
                // @codingStandardsIgnoreStart
                $markedUpText = '<h2><span class="mw-headline" 
id="Recording_sessions">Recording sessions</span><mw:editsection page="Test 
Page" section="1">Recording sessions *REMOVE THIS*</mw:editsection></h2>';
                // @codingStandardsIgnoreEnd
-               $expectedText = 'Recording sessions';
-               $this->assertTextCleaned( $expectedText, $markedUpText );
+               $expectedCleanedContent = [
+                       Util::createStartTag(
+                               '<h2>',
+               // @codingStandardsIgnoreStart
+                               '<span class="mw-headline" 
id="Recording_sessions">Recording sessions</span><mw:editsection page="Test 
Page" section="1">Recording sessions *REMOVE THIS*</mw:editsection>'
+               // @codingStandardsIgnoreEnd
+                       ),
+                       Util::createStartTag(
+                               '<span class="mw-headline" 
id="Recording_sessions">',
+                               'Recording sessions'
+                       ),
+                       'Recording sessions',
+                       new CleanedEndTag( '</span>' ),
+                       Util::createStartTag(
+                               '<mw:editsection page="Test Page" section="1">',
+                               'Recording sessions *REMOVE THIS*',
+                               true
+                       ),
+                       new CleanedEndTag( '</mw:editsection>' ),
+                       new CleanedEndTag( '</h2>' )
+               ];
+               $this->assertTextCleaned( $expectedCleanedContent, 
$markedUpText );
        }
 
        public function testHandleUtf8Characters() {
                $markedUpText = '—';
-               $expectedText = '—';
-               $this->assertTextCleaned( $expectedText, $markedUpText );
+               $expectedCleanedContent = [ '—' ];
+               $actualText = Cleaner::cleanHtml( $markedUpText );
+               $this->assertEquals( $expectedCleanedContent, $actualText );
        }
 
        public function testHandleHtmlEntities() {
                $markedUpText = '6&#160;p.m';
-               $expectedText = '6 p.m';
-               $this->assertTextCleaned( $expectedText, $markedUpText );
+               $expectedCleanedContent = [ '6 p.m' ];
+               $actualText = Cleaner::cleanHtml( $markedUpText );
+               $this->assertEquals( $expectedCleanedContent, $actualText );
+       }
+
+       public function testHandleNewlines() {
+               $markedUpText = "<i>Blonde on Blonde\n</i>";
+               $expectedCleanedContent = [
+                       Util::createStartTag( '<i>', "Blonde on Blonde\n" ),
+                       "Blonde on Blonde\n",
+                       new CleanedEndTag( '</i>' )
+               ];
+               $this->assertTextCleaned( $expectedCleanedContent, 
$markedUpText );
+       }
+
+       public function testHandleEndTagFollowedByEmptyTag() {
+               $markedUpText = '<a>c</a><br />';
+               $expectedCleanedContent = [
+                       Util::createStartTag( '<a>', 'c' ),
+                       'c',
+                       new CleanedEndTag( '</a>' ),
+                       new CleanedEmptyTag( '<br />' ),
+               ];
+               $this->assertTextCleaned( $expectedCleanedContent, 
$markedUpText );
+       }
+
+       public function testHandleEmptyTagInsideElement() {
+               $markedUpText = '<a>c<br /></a>';
+               $expectedCleanedContent = [
+                       Util::createStartTag( '<a>', 'c<br />' ),
+                       'c',
+                       new CleanedEmptyTag( '<br />' ),
+                       new CleanedEndTag( '</a>' ),
+               ];
+               $this->assertTextCleaned( $expectedCleanedContent, 
$markedUpText );
+       }
+
+       public function testCleanRemovedTags() {
+               $markedUpText = '<i>Blonde on <del>not </del>Blonde</i>';
+               $expectedCleanedContent = [
+                       Util::createStartTag( '<i>', 'Blonde on <del>not 
</del>Blonde' ),
+                       'Blonde on ',
+                       Util::createStartTag( '<del>', 'not ', true ),
+                       new CleanedEndTag( '</del>' ),
+                       'Blonde',
+                       new CleanedEndTag( '</i>' )
+               ];
+               $this->assertTextCleaned( $expectedCleanedContent, 
$markedUpText );
+       }
+
+       public function testCleanNestedRemovedTags() {
+               $markedUpText = '<i>Blonde on <del><u>not</u> </del>Blonde</i>';
+               $expectedCleanedContent = [
+                       Util::createStartTag( '<i>', 'Blonde on <del><u>not</u> 
</del>Blonde' ),
+                       'Blonde on ',
+                       Util::createStartTag( '<del>', '<u>not</u> ', true ),
+                       new CleanedEndTag( '</del>' ),
+                       'Blonde',
+                       new CleanedEndTag( '</i>' )
+               ];
+               $this->assertTextCleaned( $expectedCleanedContent, 
$markedUpText );
        }
 }
diff --git a/tests/phpunit/HtmlGeneratorTest.php 
b/tests/phpunit/HtmlGeneratorTest.php
index 6f26234..c0bcb2f 100644
--- a/tests/phpunit/HtmlGeneratorTest.php
+++ b/tests/phpunit/HtmlGeneratorTest.php
@@ -7,39 +7,101 @@
  */
 
 require_once __DIR__ . '/../../includes/HtmlGenerator.php';
+require_once 'Util.php';
 
 class HtmlGeneratorTest extends MediaWikiTestCase {
-       public function testGenerateUtterancesHtml() {
-               $utterancesStrings = [ 'An utterance.', 'Another utterance.' ];
-               $actualHtml = HtmlGenerator::generateUtterancesHtml(
-                       $utterancesStrings
-               );
-               // @codingStandardsIgnoreStart
-               $expectedHtml = '<utterances hidden=""><utterance 
id="utterance-0"><text>An 
utterance.</text><audio></audio></utterance><utterance 
id="utterance-1"><text>Another 
utterance.</text><audio></audio></utterance></utterances>';
-               // @codingStandardsIgnoreEnd
-               $this->assertEquals( $expectedHtml, $actualHtml );
-       }
-
-       public function testGenerateUtteranceContainingNumberSign() {
-               // @codingStandardsIgnoreStart
-               $utterancesStrings = [ 'Blonde on Blonde spawned two singles 
that were top-twenty hits in the US: "Rainy Day Women #12 & 35" and "I Want 
You".'
+       public function testCreateUtterancesHtml() {
+               $segments = [
+                       [
+                               'position' => 0,
+                               'content' => [
+                                       Util::createStartTag( '<i>', '<b>Blonde 
on Blonde</b>' ),
+                                       Util::createStartTag( '<b>', 'Blonde on 
Blonde' ),
+                                       'Blonde on Blonde',
+                                       new CleanedEndTag( '</b>' ),
+                                       new CleanedEndTag( '</i>' ),
+                                       ' is the seventh studio album by 
American singer-songwriter ',
+                                       Util::createStartTag(
+                                               '<a href="Bob_Dylan">',
+                                               'Bob Dylan'
+                                       ),
+                                       'Bob Dylan',
+                                       new CleanedEndTag( '</a>' ),
+                                       '.'
+                               ]
+                       ],
+                       [
+                               'position' => 123,
+                               'content' => [
+                                       ' Recording sessions began in ',
+                                       Util::createStartTag(
+                                               '<a href="New_York">',
+                                               'New York'
+                                       ),
+                                       'New York',
+                                       new CleanedEndTag( '</a>' ),
+                                       ' in October 1965.'
+                               ]
+                       ]
                ];
-               // @codingStandardsIgnoreEnd
-               $actualHtml = HtmlGenerator::generateUtterancesHtml(
-                       $utterancesStrings
-               );
+               $actualHtml = HtmlGenerator::createUtterancesHtml( $segments );
                // @codingStandardsIgnoreStart
-               $expectedHtml = '<utterances hidden=""><utterance 
id="utterance-0"><text>Blonde on Blonde spawned two singles that were 
top-twenty hits in the US: "Rainy Day Women #12 & 35" and "I Want 
You".</text><audio></audio></utterance></utterances>';
+               $expectedHtml =
+                       '<utterances hidden=""><utterance id="utterance-0" 
position="0"><nodes><cleaned-tag>i</cleaned-tag><cleaned-tag>b</cleaned-tag>Blonde
 on Blonde<cleaned-tag>/b</cleaned-tag><cleaned-tag>/i</cleaned-tag> is the 
seventh studio album by American singer-songwriter <cleaned-tag>a 
href="Bob_Dylan"</cleaned-tag>Bob 
Dylan<cleaned-tag>/a</cleaned-tag>.</nodes></utterance><utterance 
id="utterance-1" position="123"><nodes> Recording sessions began in 
<cleaned-tag>a href="New_York"</cleaned-tag>New 
York<cleaned-tag>/a</cleaned-tag> in October 
1965.</nodes></utterance></utterances>';
                // @codingStandardsIgnoreEnd
                $this->assertEquals( $expectedHtml, $actualHtml );
        }
 
-       public function testDontGenerateUtterancesHtmlForNoUtterances() {
-               $utterancesStrings = [];
-               $actualHtml = HtmlGenerator::generateUtterancesHtml(
-                       $utterancesStrings
+       public function testCreateUtteranceContainingNumberSign() {
+               $segments = [
+                       [
+                               'position' => 0,
+                               'content' => [
+                                       // @codingStandardsIgnoreStart
+                                       'Blonde on Blonde spawned two singles 
that were top-twenty hits in the US: "Rainy Day Women #12 & 35" and "I Want 
You".'
+                                       // @codingStandardsIgnoreEnd
+                               ]
+                       ]
+               ];
+               $actualHtml = HtmlGenerator::createUtterancesHtml(
+                       $segments
+               );
+               // @codingStandardsIgnoreStart
+               $expectedHtml = '<utterances hidden=""><utterance 
id="utterance-0" position="0"><nodes>Blonde on Blonde spawned two singles that 
were top-twenty hits in the US: "Rainy Day Women #12 & 35" and "I Want 
You".</nodes></utterance></utterances>';
+               // @codingStandardsIgnoreEnd
+               $this->assertEquals( $expectedHtml, $actualHtml );
+       }
+
+       public function testDontCreateUtterancesHtmlForNoUtterances() {
+               $segments = [];
+               $actualHtml = HtmlGenerator::createUtterancesHtml(
+                       $segments
                );
                $expectedHtml = '';
                $this->assertEquals( $expectedHtml, $actualHtml );
        }
+
+       public function testCreateUtterancesContainingRemovedTags() {
+               $segments = [
+                       [
+                               'position' => 0,
+                               'content' => [
+                                       Util::createStartTag( '<i>', '<b>Blonde 
on <del>not </del>Blonde</b>' ),
+                                       Util::createStartTag( '<b>', 'Blonde on 
<del>not </del>Blonde' ),
+                                       'Blonde on ',
+                                       Util::createStartTag( '<del>', 'not ', 
true ),
+                                       new CleanedEndTag( '</del>' ),
+                                       'Blonde',
+                                       new CleanedEndTag( '</b>' ),
+                                       new CleanedEndTag( '</i>' )
+                               ]
+                       ]
+               ];
+               $actualHtml = HtmlGenerator::createUtterancesHtml( $segments );
+               // @codingStandardsIgnoreStart
+               $expectedHtml =
+                       '<utterances hidden=""><utterance id="utterance-0" 
position="0"><nodes><cleaned-tag>i</cleaned-tag><cleaned-tag>b</cleaned-tag>Blonde
 on <cleaned-tag 
removed="4">del</cleaned-tag><cleaned-tag>/del</cleaned-tag>Blonde<cleaned-tag>/b</cleaned-tag><cleaned-tag>/i</cleaned-tag></nodes></utterance></utterances>';
+               // @codingStandardsIgnoreEnd
+               $this->assertEquals( $expectedHtml, $actualHtml );
+       }
 }
diff --git a/tests/phpunit/SegmenterTest.php b/tests/phpunit/SegmenterTest.php
index 4f70d97..bba7dc8 100644
--- a/tests/phpunit/SegmenterTest.php
+++ b/tests/phpunit/SegmenterTest.php
@@ -7,74 +7,206 @@
  */
 
 require_once __DIR__ . '/../../includes/Segmenter.php';
+require_once 'Util.php';
 
 class SegmenterTest extends MediaWikiTestCase {
-
        public function testSegmentSentences() {
-               // @codingStandardsIgnoreStart
-               $input = "Blonde on Blonde is the seventh studio album by 
American singer-songwriter Bob Dylan, released on May 16, 1966, on Columbia 
Records. Recording sessions began in New York in October 1965 with numerous 
backing musicians, including members of Dylan's live backing band, the Hawks.";
+               $cleanedContent = [
+                       Util::createStartTag( '<i>', '<b>Blonde on Blonde</b>' 
),
+                       Util::createStartTag( '<b>', 'Blonde on Blonde' ),
+                       'Blonde on Blonde',
+                       new CleanedEndTag( '</b>' ),
+                       new CleanedEndTag( '</i>' ),
+                       ' is the seventh studio album by American 
singer-songwriter ',
+                       Util::createStartTag( '<a href="Bob_Dylan">', 'Bob 
Dylan' ),
+                       'Bob Dylan',
+                       new CleanedEndTag( '</a>' ),
+                       '. Recording sessions began in ',
+                       Util::createStartTag( '<a href="New_York">', 'New York' 
),
+                       'New York',
+                       new CleanedEndTag( '</a>' ),
+                       ' in October 1965.'
+               ];
                $expectedSegments = [
-                       'Blonde on Blonde is the seventh studio album by 
American singer-songwriter Bob Dylan, released on May 16, 1966, on Columbia 
Records.',
-                       "Recording sessions began in New York in October 1965 
with numerous backing musicians, including members of Dylan's live backing 
band, the Hawks." ];
-               // @codingStandardsIgnoreEnd
-               $segments = Segmenter::segmentSentences( $input );
+                       [
+                               'position' => 0,
+                               'content' => [
+                                       Util::createStartTag(
+                                               '<i>',
+                                               '<b>Blonde on Blonde</b>'
+                                       ),
+                                       Util::createStartTag(
+                                               '<b>',
+                                               'Blonde on Blonde'
+                                       ),
+                                       'Blonde on Blonde',
+                                       new CleanedEndTag( '</b>' ),
+                                       new CleanedEndTag( '</i>' ),
+                                       ' is the seventh studio album by 
American singer-songwriter ',
+                                       Util::createStartTag(
+                                               '<a href="Bob_Dylan">',
+                                               'Bob Dylan'
+                                       ),
+                                       'Bob Dylan',
+                                       new CleanedEndTag( '</a>' ),
+                                       '.'
+                               ]
+                       ],
+                       [
+                               'position' => 123,
+                               'content' => [
+                                       ' Recording sessions began in ',
+                                       Util::createStartTag(
+                                               '<a href="New_York">',
+                                               'New York'
+                                       ),
+                                       'New York',
+                                       new CleanedEndTag( '</a>' ),
+                                       ' in October 1965.'
+                               ]
+                       ]
+               ];
+               $segments = Segmenter::segmentSentences( $cleanedContent );
                $this->assertEquals( $expectedSegments, $segments );
        }
 
        public function testDontSegmentByEllipses() {
-               $input = "I mean, in ten recording sessions, man, we didn't get 
one song...It was the band.";
+               $cleanedContent = [
+                       "I mean, in ten recording sessions, man, we didn't get 
one song...It was the band."
+                       ];
                $expectedSegments = [
-                       "I mean, in ten recording sessions, man, we didn't get 
one song...It was the band." ];
-               $segments = Segmenter::segmentSentences( $input );
+                       [
+                               'position' => 0,
+                               'content' => [
+                                       "I mean, in ten recording sessions, 
man, we didn't get one song...It was the band."
+                               ]
+                       ]
+               ];
+               $segments = Segmenter::segmentSentences( $cleanedContent );
                $this->assertEquals( $expectedSegments, $segments );
        }
 
        public function testDontSegmentByAbbreviations() {
                // @codingStandardsIgnoreStart
-               $input = 'On February 15 the session began at 6&nbsp;p.m. but 
Dylan simply sat in the studio working on his lyrics while the musicians played 
cards, napped and chatted.';
+               $cleanedContent = [
+                       'On February 15 the session began at 6&nbsp;p.m. but 
Dylan simply sat in the studio working on his lyrics while the musicians played 
cards, napped and chatted.'
+                       ];
                $expectedSegments = [
-                       'On February 15 the session began at 6&nbsp;p.m. but 
Dylan simply sat in the studio working on his lyrics while the musicians played 
cards, napped and chatted.' ];
+                       [
+                               'position' => 0,
+                               'content' => [
+                                       'On February 15 the session began at 
6&nbsp;p.m. but Dylan simply sat in the studio working on his lyrics while the 
musicians played cards, napped and chatted.'
+                               ]
+                       ]
+               ];
                // @codingStandardsIgnoreEnd
-               $segments = Segmenter::segmentSentences( $input );
+               $segments = Segmenter::segmentSentences( $cleanedContent );
                $this->assertEquals( $expectedSegments, $segments );
        }
 
        public function testDontSegmentByDotDirectlyFollowedByComma() {
-               // @codingStandardsIgnoreStart
-               $input = 'Two people had strongly recommended the Hawks to 
Dylan: Mary Martin, the executive secretary of Albert Grossman, and blues 
singer John Hammond, Jr., son of record producer John Hammond, who had signed 
Dylan to Columbia Records in 1961; the Hawks had backed the younger Hammond on 
his 1965 album So Many Roads.';
+               $cleanedContent = [
+                       'and blues singer John Hammond, Jr., son of record 
producer John Hammond'
+               ];
                $expectedSegments = [
-                       'Two people had strongly recommended the Hawks to 
Dylan: Mary Martin, the executive secretary of Albert Grossman, and blues 
singer John Hammond, Jr., son of record producer John Hammond, who had signed 
Dylan to Columbia Records in 1961; the Hawks had backed the younger Hammond on 
his 1965 album So Many Roads.' ];
-               // @codingStandardsIgnoreEnd
-               $segments = Segmenter::segmentSentences( $input );
-               $this->assertEquals( $expectedSegments, $segments );
-       }
-
-       public function testDontRemoveStringsWithoutDots() {
-               $input = "Recording sessions\n\nBackground";
-               $expectedSegments = [ 'Recording sessions', 'Background' ];
-               $segments = Segmenter::segmentSentences( $input );
-               $this->assertEquals( $expectedSegments, $segments );
-       }
-
-       public function testSegmentParagraphs() {
-               $input = "Recording sessions
-
-Background
-After the release of Highway 61 Revisited in August 1965, Dylan set ...";
-               $expectedSegments = [
-                       'Recording sessions',
-                       'Background',
-                       'After the release of Highway 61 Revisited in August 
1965, Dylan set ...' ];
-               $segments = Segmenter::segmentParagraphs( $input );
+                       [
+                               'position' => 0,
+                               'content' => [
+                                       'and blues singer John Hammond, Jr., 
son of record producer John Hammond'
+                               ]
+                       ]
+               ];
+               $segments = Segmenter::segmentSentences( $cleanedContent );
                $this->assertEquals( $expectedSegments, $segments );
        }
 
        public function testDontSegmentByDecimalDot() {
-               $input = 'the two-CD set went on sale for $18.99 and the 
three-CD version for $129.99';
+               $cleanedContent = [
+                       'the two-CD set went on sale for $18.99 and the 
three-CD version for $129.99'
+                       ];
                // @codingStandardsIgnoreStart
-               $expectedSegments = [ 'the two-CD set went on sale for $18.99 
and the three-CD version for $129.99' ];
+               $expectedSegments = [
+                       [
+                               'position' => 0,
+                               'content' => [
+                                       'the two-CD set went on sale for $18.99 
and the three-CD version for $129.99'
+                               ]
+                       ]
+               ];
                // @codingStandardsIgnoreEnd
-               $segments = Segmenter::segmentParagraphs( $input );
+               $segments = Segmenter::segmentSentences( $cleanedContent );
+               $this->assertEquals( $expectedSegments, $segments );
+       }
+
+       public function 
testKeepLastSegmentEvenIfNotEndingWithSentenceFinalCharacter() {
+               $cleanedContent = [ 'Recording sessions' ];
+               $expectedSegments = [
+                       [
+                               'position' => 0,
+                               'content' => [ 'Recording sessions' ]
+                       ]
+               ];
+               $segments = Segmenter::segmentSentences( $cleanedContent );
+               $this->assertEquals( $expectedSegments, $segments );
+       }
+
+       public function testSegmentEndingWithTag() {
+               $cleanedContent = [
+                       Util::createStartTag( '<i>', '<b>Blonde on Blonde</b>' 
),
+                       Util::createStartTag( '<b>', 'Blonde on Blonde' ),
+                       'Blonde on Blonde',
+                       new CleanedEndTag( '</b>' ),
+                       new CleanedEndTag( '</i>' )
+               ];
+               $expectedSegments = [
+                       [
+                               'position' => 0,
+                               'content' => [
+                                       Util::createStartTag(
+                                               '<i>',
+                                               '<b>Blonde on Blonde</b>'
+                                       ),
+                                       Util::createStartTag(
+                                               '<b>',
+                                               'Blonde on Blonde'
+                                       ),
+                                       'Blonde on Blonde',
+                                       new CleanedEndTag( '</b>' ),
+                                       new CleanedEndTag( '</i>' )
+                               ]
+                       ]
+               ];
+               $segments = Segmenter::segmentSentences( $cleanedContent );
+               $this->assertEquals( $expectedSegments, $segments );
+       }
+
+       public function testSegmentsContainRemovedTags() {
+               $cleanedContent = [
+                       'Blonde on Blonde is ',
+                       Util::createStartTag( '<del>', 'not ', true ),
+                       new CleanedEndTag( '</del>' ),
+            // @codingStandardsIgnoreStart
+                       'the seventh studio album by American singer-songwriter 
Bob Dylan. Recording sessions began in New York in October 1965.'
+            // @codingStandardsIgnoreEnd
+               ];
+               $expectedSegments = [
+                       [
+                               'position' => 0,
+                               'content' => [
+                                       'Blonde on Blonde is ',
+                                       Util::createStartTag( '<del>', 'not ', 
true ),
+                                       new CleanedEndTag( '</del>' ),
+                                       'the seventh studio album by American 
singer-songwriter Bob Dylan.'
+                               ]
+                       ],
+                       [
+                               'position' => 100,
+                               'content' => [
+                                       ' Recording sessions began in New York 
in October 1965.'
+                               ]
+                       ]
+               ];
+               $segments = Segmenter::segmentSentences( $cleanedContent );
                $this->assertEquals( $expectedSegments, $segments );
        }
 }
diff --git a/tests/phpunit/Util.php b/tests/phpunit/Util.php
new file mode 100644
index 0000000..5be44f8
--- /dev/null
+++ b/tests/phpunit/Util.php
@@ -0,0 +1,33 @@
+<?php
+
+/**
+ * @file
+ * @ingroup Extensions
+ * @license GPL-2.0+
+ */
+
+class Util {
+
+       /**
+        * Create a CleanedStartTag and set its $contentLength.
+        *
+        * @since 0.0.1
+        * @param string $tagString The tag string for the CleanedStartTag.
+        * @param string $contentString The content string, used for calculating
+        *  $contentLength for the CleanedStartTag.
+        * @return CleanedStartTag
+        */
+
+       public static function createStartTag(
+               $tagString,
+               $contentString,
+               $removed=false
+       ) {
+               $cleanedTag = new CleanedStartTag( $tagString );
+               if ( $removed ) {
+                       $cleanedTag->removed = true;
+               }
+               $cleanedTag->contentLength = strlen( $contentString );
+               return $cleanedTag;
+       }
+}
diff --git a/tests/qunit/ext.wikispeech.test.js 
b/tests/qunit/ext.wikispeech.test.js
index 9ae7fb2..e8ff326 100644
--- a/tests/qunit/ext.wikispeech.test.js
+++ b/tests/qunit/ext.wikispeech.test.js
@@ -5,16 +5,20 @@
                setup: function () {
                        wikispeech = new mw.wikispeech.Wikispeech();
                        server = sinon.fakeServer.create();
-                       server.respondWith( '{"audio": 
"http://server.url/audio"}' );
+                       server.respondWith(
+                               '{"audio": "http://server.url/audio", "tokens": 
[{"orth": "tokens"}, {"orth": "from"}, {"orth": "server"}]}'
+                       );
                        // overrideMimeType() isn't defined by default.
                        server.xhr.prototype.overrideMimeType = function () {};
                        $( '#qunit-fixture' ).append( createUtteranceElement(
-                               'utterance-0',
-                               'A mockup utterance.'
+                               0,
+                               0,
+                               
'<cleaned-tag>i</cleaned-tag><cleaned-tag>b</cleaned-tag>Blonde on 
Blonde<cleaned-tag>/b</cleaned-tag><cleaned-tag>/i</cleaned-tag> is the seventh 
studio album by American singer-songwriter <cleaned-tag>a 
href="Bob_Dylan"</cleaned-tag>Bob Dylan<cleaned-tag>/a</cleaned-tag>.'
                        ) );
                        $( '#qunit-fixture' ).append( createUtteranceElement(
-                               'utterance-1',
-                               'Another mockup utterance.'
+                               1,
+                               123,
+                               ' Recording sessions began in <cleaned-tag>a 
href="New_York"</cleaned-tag>New York<cleaned-tag>/a</cleaned-tag> in October 
1965.'
                        ) );
                        $( '#qunit-fixture' ).append(
                                $( '<h1></h1>' ).attr( 'id', 'firstHeading' )
@@ -37,12 +41,12 @@
                }
        } );
 
-       function createUtteranceElement( id, text ) {
+       function createUtteranceElement( id, position, text ) {
                return $( '<utterance></utterance>' )
-                       .attr( 'id', id )
-                       .append( $( '<text></text>' )
-                               .text( text ) )
-                       .append( $( '<audio></audio>' ) );
+                       .attr( 'id', 'utterance-' + id )
+                       .attr( 'position', position )
+                       .append( $( '<nodes></nodes>' )
+                               .html( text ) );
        }
 
        QUnit.test( 'prepareUtterance', function ( assert ) {
@@ -70,11 +74,9 @@
        } );
 
        QUnit.test( 'prepareUtterance: prepare next utterance when playing', 
function ( assert ) {
-               var $nextUtterance;
-
+               var $nextUtterance = $( '#utterance-1' );
                assert.expect( 1 );
                wikispeech.prepareUtterance( $( '#utterance-0' ) );
-               $nextUtterance = $( '#utterance-1' );
                sinon.spy( wikispeech, 'prepareUtterance' );
 
                $( '#utterance-0 audio' ).trigger( 'play' );
@@ -115,11 +117,9 @@
        } );
 
        QUnit.test( 'prepareUtterance: stop when end of text is reached', 
function ( assert ) {
-               var $lastUtterance;
-
+               var $lastUtterance = $( '#utterance-1' );
                assert.expect( 1 );
                sinon.spy( wikispeech, 'stop' );
-               $lastUtterance = $( '#utterance-1' );
                wikispeech.prepareUtterance( $lastUtterance );
                wikispeech.playUtterance( $lastUtterance );
 
@@ -129,14 +129,15 @@
        } );
 
        QUnit.test( 'loadAudio', function ( assert ) {
-               assert.expect( 3 );
+               assert.expect( 4 );
+               sinon.spy( wikispeech, 'addTokenElements' );
 
                wikispeech.loadAudio( $( '#utterance-0' ) );
 
                server.respond();
                assert.strictEqual(
                        server.requests[ 0 ].requestBody,
-                       'lang=en&input_type=text&input=A+mockup+utterance.'
+                       
'lang=en&input_type=text&input=Blonde+on+Blonde+is+the+seventh+studio+album+by+American+singer-songwriter+Bob+Dylan.'
                );
                assert.strictEqual(
                        $( '#utterance-0 audio' ).attr( 'src' ),
@@ -144,6 +145,13 @@
                );
                assert.strictEqual(
                        $( '#utterance-0' ).prop( 'requested' ),
+                       true
+               );
+               assert.strictEqual(
+                       wikispeech.addTokenElements.calledWith(
+                               $( '#utterance-0' ),
+                               [ { orth: 'tokens' }, { orth: 'from' }, { orth: 
'server' } ]
+                       ),
                        true
                );
        } );
@@ -208,9 +216,7 @@
         */
 
        function createKeydownEvent( keyCode, modifiers ) {
-               var event;
-
-               event = $.Event( 'keydown' );
+               var event = $.Event( 'keydown' );
                event.which = keyCode;
                event.ctrlKey = modifiers.indexOf( 'c' ) >= 0;
                event.altKey = modifiers.indexOf( 'a' ) >= 0;
@@ -232,6 +238,7 @@
                assert.expect( 4 );
                wikispeech.addPlayStopButton();
                wikispeech.play();
+               wikispeech.prepareUtterance( $( '#utterance-0' ) );
                $( '#utterance-0 audio' ).prop( 'currentTime', 1 );
 
                wikispeech.stop();
@@ -254,11 +261,10 @@
        } );
 
        QUnit.test( 'play', function ( assert ) {
-               var $firstUtterance;
-
+               var $firstUtterance = $( '#utterance-0' );
                assert.expect( 3 );
                wikispeech.addPlayStopButton();
-               $firstUtterance = $( '#utterance-0' );
+               wikispeech.prepareUtterance( $firstUtterance );
 
                wikispeech.play();
 
@@ -293,6 +299,7 @@
 
        QUnit.test( 'skipAheadUtterance', function ( assert ) {
                assert.expect( 2 );
+               wikispeech.prepareUtterance( $( '#utterance-0' ) );
                wikispeech.play();
 
                wikispeech.skipAheadUtterance();
@@ -318,8 +325,8 @@
                var $nextUtterance;
 
                assert.expect( 1 );
-               $nextUtterance =
-                       wikispeech.getNextUtterance( $( '#utterance-0' ) );
+
+               $nextUtterance = wikispeech.getNextUtterance( $( '#utterance-0' 
) );
 
                assert.strictEqual(
                        $nextUtterance.get( 0 ),
@@ -331,8 +338,161 @@
                var $nextUtterance;
 
                assert.expect( 1 );
+
                $nextUtterance = wikispeech.getNextUtterance( $() );
 
                assert.strictEqual( $nextUtterance.length, 0 );
        } );
+
+       QUnit.test( 'addTokenElements', function ( assert ) {
+               var nodesHtml, $utterance, tokens, $actualTokensElement,
+                       $expectedTokensElement;
+
+               nodesHtml = '<cleaned-tag>i</cleaned-tag><cleaned-tag>b</cleaned-tag>Blonde on Blonde<cleaned-tag>/b</cleaned-tag><cleaned-tag>/i</cleaned-tag>';
+               $utterance = $( '<utterance></utterance>' )
+                       .append(
+                               $( '<nodes></nodes>' )
+                                       .prop( 'innerHTML', nodesHtml )
+                       )
+                       .attr( 'position', '0' );
+               tokens = [
+                       { orth: 'Blonde' },
+                       { orth: 'on' },
+                       { orth: 'Blonde' }
+               ];
+
+               wikispeech.addTokenElements( $utterance, tokens );
+
+               $actualTokensElement = $utterance.children( 'tokens' );
+               $expectedTokensElement = $( '<tokens></tokens>' )
+                       .append( $( '<token></token>' )
+                                       .text( 'Blonde' )
+                                       .attr( 'position', 6 ) )
+                       .append( $( '<token></token>' )
+                                       .text( 'on' )
+                                       .attr( 'position', 13 ) )
+                       .append( $( '<token></token>' )
+                                       .text( 'Blonde' )
+                                       .attr( 'position', 16 ) );
+               assert.strictEqual(
+                       $actualTokensElement.prop( 'outerHTML' ),
+                       $expectedTokensElement.prop( 'outerHTML' )
+               );
+       } );
+
+       QUnit.test( 'addTokenElements: handle tag', function ( assert ) {
+               var nodesHtml, $utterance, tokens, $actualTokensElement,
+                       $expectedTokensElement;
+
+               nodesHtml = 'American singer-songwriter <cleaned-tag>a href="/wiki/Bob_Dylan" title="Bob Dylan"</cleaned-tag>Bob Dylan<cleaned-tag>/a</cleaned-tag>.';
+               $utterance = $( '<utterance></utterance>' )
+                       .append(
+                               $( '<nodes></nodes>' )
+                                       .prop( 'innerHTML', nodesHtml )
+                       )
+                       .attr( 'position', '0' );
+               tokens = [
+                       { orth: 'American' },
+                       { orth: 'singer-songwriter' },
+                       { orth: 'Bob' },
+                       { orth: 'Dylan' },
+                       { orth: '.' }
+               ];
+
+               wikispeech.addTokenElements( $utterance, tokens );
+
+               $actualTokensElement = $utterance.children( 'tokens' );
+               $expectedTokensElement = $( '<tokens></tokens>' )
+                       .append( $( '<token></token>' )
+                                       .text( 'American' )
+                                       .attr( 'position', 0 ) )
+                       .append( $( '<token></token>' )
+                                       .text( 'singer-songwriter' )
+                                       .attr( 'position', 9 ) )
+                       .append( $( '<token></token>' )
+                                       .text( 'Bob' )
+                                       .attr( 'position', 71 ) )
+                       .append( $( '<token></token>' )
+                                       .text( 'Dylan' )
+                                       .attr( 'position', 75 ) )
+                       .append( $( '<token></token>' )
+                                       .text( '.' )
+                                       .attr( 'position', 84 ) );
+               assert.strictEqual(
+                       $actualTokensElement.prop( 'outerHTML' ),
+                       $expectedTokensElement.prop( 'outerHTML' )
+               );
+       } );
+
+       QUnit.test( 'addTokenElements: utterance position offset', function ( assert ) {
+               var nodesHtml, $utterance, tokens, $actualTokensElement,
+                       $expectedTokensElement;
+
+               nodesHtml = '<cleaned-tag>i</cleaned-tag><cleaned-tag>b</cleaned-tag>Blonde on Blonde<cleaned-tag>/b</cleaned-tag><cleaned-tag>/i</cleaned-tag>';
+               $utterance = $( '<utterance></utterance>' )
+                       .append(
+                               $( '<nodes></nodes>' )
+                                       .prop( 'innerHTML', nodesHtml )
+                       )
+                       .attr( 'position', '3' );
+               tokens = [
+                       { orth: 'Blonde' },
+                       { orth: 'on' },
+                       { orth: 'Blonde' }
+               ];
+
+               wikispeech.addTokenElements( $utterance, tokens );
+
+               $actualTokensElement = $utterance.children( 'tokens' );
+               $expectedTokensElement = $( '<tokens></tokens>' )
+                       .append( $( '<token></token>' )
+                                       .text( 'Blonde' )
+                                       .attr( 'position', 9 ) )
+                       .append( $( '<token></token>' )
+                                       .text( 'on' )
+                                       .attr( 'position', 16 ) )
+                       .append( $( '<token></token>' )
+                                       .text( 'Blonde' )
+                                       .attr( 'position', 19 ) );
+               assert.strictEqual(
+                       $actualTokensElement.prop( 'outerHTML' ),
+                       $expectedTokensElement.prop( 'outerHTML' )
+               );
+       } );
+
+       QUnit.test( 'addTokenElements: handle removed element', function ( assert ) {
+               var nodesHtml, $utterance, tokens, $actualTokensElement,
+                       $expectedTokensElement;
+
+               nodesHtml = '<cleaned-tag>i</cleaned-tag><cleaned-tag>b</cleaned-tag>Blonde on <cleaned-tag removed="4">del</cleaned-tag><cleaned-tag>/del</cleaned-tag>Blonde<cleaned-tag>/b</cleaned-tag><cleaned-tag>/i</cleaned-tag>';
+               $utterance = $( '<utterance></utterance>' )
+                       .append(
+                               $( '<nodes></nodes>' )
+                                       .prop( 'innerHTML', nodesHtml )
+                       )
+                       .attr( 'position', '0' );
+               tokens = [
+                       { orth: 'Blonde' },
+                       { orth: 'on' },
+                       { orth: 'Blonde' }
+               ];
+
+               wikispeech.addTokenElements( $utterance, tokens );
+
+               $actualTokensElement = $utterance.children( 'tokens' );
+               $expectedTokensElement = $( '<tokens></tokens>' )
+                       .append( $( '<token></token>' )
+                                       .text( 'Blonde' )
+                                       .attr( 'position', 6 ) )
+                       .append( $( '<token></token>' )
+                                       .text( 'on' )
+                                       .attr( 'position', 13 ) )
+                       .append( $( '<token></token>' )
+                                       .text( 'Blonde' )
+                                       .attr( 'position', 31 ) );
+               assert.strictEqual(
+                       $actualTokensElement.prop( 'outerHTML' ),
+                       $expectedTokensElement.prop( 'outerHTML' )
+               );
+       } );
 } )( mediaWiki, jQuery );

-- 
To view, visit https://gerrit.wikimedia.org/r/314237
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Ie784328fa3d7bcf7941b6b89146687272fe3b0ca
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/Wikispeech
Gerrit-Branch: master
Gerrit-Owner: Sebastian Berlin (WMSE) <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

[MediaWiki-commits] [Gerrit] mediawiki...Wikispeech[master]: Map tokens from TTS responses to HTML

Reply via email to