[MediaWiki-commits] [Gerrit] mediawiki...Wikispeech[master]: Map tokens from TTS responses to HTML

jenkins-bot (Code Review) Mon, 28 Nov 2016 00:48:54 -0800

jenkins-bot has submitted this change and it was merged.

Change subject: Map tokens from TTS responses to HTML
......................................................................



Map tokens from TTS responses to HTML

Added mapping between the tokens received from the TTS server and the
"words" in the HTML. Tokens are stored in the utterance elements and
are assigned a position attribute, which is the index of the start of
the corresponding HTML substring.

Cleaner received an overhaul to store information about the cleaned
tags. Previously, this information was just thrown away. The other
classes in the "PHP-chain" were updated to handle the new information.

Most tests, both PHP and JavaScript, were refactored to make them
easier to read.

Bug: T140105
Bug: T149174
Bug: T149799
Change-Id: Ie784328fa3d7bcf7941b6b89146687272fe3b0ca
---
M Hooks.php
A includes/CleanedTag.php
M includes/Cleaner.php
M includes/HtmlGenerator.php
M includes/Segmenter.php
M modules/ext.wikispeech.js
M tests/phpunit/CleanerTest.php
M tests/phpunit/HtmlGeneratorTest.php
M tests/phpunit/SegmenterTest.php
A tests/phpunit/Util.php
M tests/qunit/ext.wikispeech.test.js
11 files changed, 1,582 insertions(+), 317 deletions(-)

Approvals:
  Lokal Profil: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/Hooks.php b/Hooks.php
index 33e6c1d..a7c3211 100644
--- a/Hooks.php
+++ b/Hooks.php
@@ -18,7 +18,7 @@
         *
         * @param array $testModules The array of registered test modules
         * @param ResourceLoader $resourceLoader The reference to the resource
-        *      loader
+        *  loader
         * @return true
         */
 
@@ -46,7 +46,7 @@
         * namespace.
         *
         * @param $parser Parser object. Can be used to manually parse a portion
-        *      of wiki text from the $text.
+        *  of wiki text from the $text.
         * @param $text Represents the text for page.
         */
 
@@ -59,14 +59,17 @@
                                'HTML from onParserAfterTidy(): ' . $text
                        );
                        $cleanedText = Cleaner::cleanHtml( $text );
-                       wfDebugLog( 'Wikispeech', 'Cleaned text: ' . 
$cleanedText );
+                       wfDebugLog(
+                               'Wikispeech',
+                               'Cleaned text: ' . var_export( $cleanedText, 
true )
+                       );
                        $utterances = Segmenter::segmentSentences( $cleanedText 
);
                        wfDebugLog(
                                'Wikispeech',
                                'Utterances: ' . var_export( $utterances, true )
                        );
                        $utterancesHtml =
-                               HtmlGenerator::generateUtterancesHtml( 
$utterances );
+                               HtmlGenerator::createUtterancesHtml( 
$utterances );
                        wfDebugLog(
                                'Wikispeech',
                                'Adding utterances HTML: ' . $utterancesHtml
@@ -101,7 +104,7 @@
         *
         * @param OutputPage $out The OutputPage object.
         * @param Skin $skin Skin object that will be used to generate the page,
-        *      added in 1.13.
+        *  added in 1.13.
         */
 
        public static function onBeforePageDisplay(
diff --git a/includes/CleanedTag.php b/includes/CleanedTag.php
new file mode 100644
index 0000000..697032c
--- /dev/null
+++ b/includes/CleanedTag.php
@@ -0,0 +1,73 @@
+<?php
+
+/**
+ * @file
+ * @ingroup Extensions
+ * @license GPL-2.0+
+ */
+
+abstract class CleanedTag {
+
+       /**
+        * The string representation of the tag, as it is written in the
+        * HTML. This includes the tag name, any attributes, and the
+        * brackets.
+        *
+        * @var string $tagString
+        */
+
+       public $tagString;
+
+       function __construct( $tagString ) {
+               $this->tagString = $tagString;
+       }
+
+       /**
+        * Get the length of the tag string.
+        *
+        * @since 0.0.1
+        * @return int The length of the tag string.
+        */
+
+       function getLength() {
+               return strlen( $this->tagString );
+       }
+}
+
+class CleanedStartTag extends CleanedTag {
+
+       /**
+        * The length of the element content, i.e. the string delimited by
+        * this start tag and the corresponding end tag.
+        *
+        * @var int $contentLength
+        */
+
+       public $contentLength;
+
+       function __construct( $tagString ) {
+               parent::__construct( $tagString );
+               $this->contentLength = 0;
+       }
+
+       /**
+        * Get the length of the tag string.
+        *
+        * @since 0.0.1
+        * @return int The length of the tag string, including element content.
+        */
+
+       function getLength() {
+               $length = strlen( $this->tagString );
+               if ( $this->contentLength ) {
+                       $length += $this->contentLength;
+               }
+               return $length;
+       }
+}
+
+class CleanedEndTag extends CleanedTag {
+}
+
+class CleanedEmptyElementTag extends CleanedTag {
+}
diff --git a/includes/Cleaner.php b/includes/Cleaner.php
index 9dcc7e6..e7b4a1f 100644
--- a/includes/Cleaner.php
+++ b/includes/Cleaner.php
@@ -6,22 +6,55 @@
  * @license GPL-2.0+
  */
 
+require_once 'CleanedTag.php';
+
 class Cleaner {
 
        /**
-        * Clean HTML tags by removing some altogether and keeping content
-        * for some.
+        * Clean HTML tags from a string.
+        *
+        * Separates any HTML tags from the text.
         *
         * @since 0.0.1
-        * @param string $markedUpText Input text that may contain HTML tags.
-        * @return string The text with HTML tags removed/replaced with
-        * contents.
+        * @param string $markedUpText Input text that may contain HTML
+        *  tags.
+        * @return array An array of nodes where tags are stored as
+        *  CleanedTags and text nodes as strings.
         */
 
        public static function cleanHtml( $markedUpText ) {
+               $dom = self::createDomDocument( $markedUpText );
+               $tags = self::getTags( $markedUpText );
+               // Start adding the nodes that are children of the dummy
+               // element. To not add the actual dummy tags, index starts on
+               // -1.
+               $tagIndex = -1;
+               $cleanedContent = [];
+               self::addContent(
+                       $cleanedContent,
+                       $dom->documentElement->firstChild,
+                       $markedUpText,
+                       $tags,
+                       $tagIndex
+               );
+               return $cleanedContent;
+       }
+
+       /**
+        * Create a DOMDocument from an HTML string.
+        *
+        * A dummy element is added as top node.
+        *
+        * @since 0.0.1
+        * @param string $markedUpString The string to create the
+        *  DOMDocument.
+        * @return DOMDocument The created DOMDocument.
+        */
+
+       private static function createDomDocument( $markedUpText ) {
                $dom = new DOMDocument();
-               // Add encoding information and wrap the input text in a dummy 
tag
-               // to prevent p tags from being added for text nodes.
+               // Add encoding information and wrap the input text in a dummy
+               // tag to prevent p tags from being added for text nodes.
                // @codingStandardsIgnoreStart
                $wrappedText = '<head><meta http-equiv="Content-Type" 
content="text/html; charset=utf-8"/><dummy>' . $markedUpText . 
'</dummy></head>';
                // @codingStandardsIgnoreEnd
@@ -30,48 +63,370 @@
                        $wrappedText,
                        LIBXML_HTML_NODEFDTD | LIBXML_HTML_NOIMPLIED
                );
-               $cleanedText = self::getTextContent( $dom->documentElement );
-               return $cleanedText;
+               return $dom;
        }
 
        /**
-        * Recursively get the text from a node and its children.
+        * Extract a list of tags from a string.
+        *
+        * Tags are extracted in the order they appear. This is done using
+        * regex since we need the exact string representation of tags to
+        * get their correct lengths.
+        *
+        * When a start tag is encountered, it's stored as an array
+        * containing the tag string and the start position of the
+        * tag. This array is then added to another array, which holds a
+        * start-end pair of tags.
+        *
+        * When an end tag is encountered, it's stored as an array
+        * containing the tag string and the end position of the tag. This
+        * array is then added to the array containing the corresponding
+        * start tag.
+        *
+        * Empty element tags are added as tag strings.
         *
         * @since 0.0.1
-        * @param DOMNode $node The top node to get text from.
-        * @return string The cleaned text from the nodes.
+        * @param string $markedUpText The string to extract tags from.
+        * @return array An array containing the found tags.
         */
 
-       private static function getTextContent( $node ) {
-               $content = '';
-               if ( !self::matchesRemove( $node ) ) {
-                       foreach ( $node->childNodes as $child ) {
-                               if ( $child->nodeType == XML_TEXT_NODE ) {
-                                       $content .= $child->textContent;
-                               } else {
-                                       $content .= self::getTextContent( 
$child );
+       private static function getTags( $markedUpText ) {
+               $potentialTagBrackets = [];
+               preg_match_all(
+                       '/[<>]/',
+                       $markedUpText,
+                       $potentialTagBrackets,
+                       PREG_SET_ORDER | PREG_OFFSET_CAPTURE
+               );
+               $tags = [];
+               $startBracket = null;
+               foreach ( $potentialTagBrackets as $match ) {
+                       // $match[0] is an array containing the matched string 
and it's
+                       // position.
+                       $bracketString = $match[0][0];
+                       if ( $bracketString == '<' ) {
+                               if ( $startBracket == null ) {
+                                       $startBracket = $match[0];
+                               }
+                       } elseif ( $bracketString == '>' && $startBracket != 
null ) {
+                               $tagString = substr(
+                                       $markedUpText,
+                                       $startBracket[1],
+                                       $match[0][1] - $startBracket[1] + 1
+                               );
+                               $bracketPosition = $startBracket[1];
+                               $startBracket = null;
+                               if ( self::isStartTag( $tagString ) ) {
+                                       array_push( $tags, [ [
+                                               'string' => $tagString,
+                                               'position' => $bracketPosition
+                                       ] ] );
+                               } elseif ( self::isEndTag( $tagString ) ) {
+                                       $startTagIndex = 
self::getCorrespondingStartTagIndex(
+                                               $tags,
+                                               $tagString
+                                       );
+                                       // Add the end tag to the array already 
containing the
+                                       // start tag.
+                                       array_push(
+                                               $tags[$startTagIndex],
+                                               [
+                                                       'string' => $tagString,
+                                                       'position' => 
$bracketPosition
+                                               ]
+                                       );
+                               } elseif ( self::isEmptyElementTag( $tagString 
) ) {
+                                       array_push( $tags, $tagString );
                                }
                        }
                }
-               return $content;
+               return $tags;
        }
 
        /**
-        * Check if a tag matches criteria for removal.
-        *
-        * The criteria are defined by $wgWikispeechRemoveTags, which is a map
-        * where the keys are tag names. If the value is true, the tag will be
-        * removed. If the value is an array, it defines further criteria,
-        * currently only class name, which needs to match for the tag to be
-        * removed.
-        *
-        * The value may be false, which means the tag won't be removed. This 
is to
-        * allow overriding default values in LocalSettings.php, but is 
otherwise
-        * not required.
+        * Test if a string is a start tag.
         *
         * @since 0.0.1
-        * @param DOMNode $node The node for the tag to check.
-        * @return bool true if the tag match removal criteria, otherwise false.
+        * @param $tagString The string to test.
+        * @return true if $tagString is a start tag, else false.
+        */
+
+       private static function isStartTag( $tagString ) {
+               return substr( $tagString, 0, 2 ) != '</' &&
+                       substr( $tagString, -2 ) != '/>';
+       }
+
+       /**
+        * Test if a string is an end tag.
+        *
+        * @since 0.0.1
+        * @param $tagString The string to test.
+        * @return true if $tagString is an end tag, else false.
+        */
+
+       private static function isEndTag( $tagString ) {
+               return substr( $tagString, 0, 2 ) == '</';
+       }
+
+       /**
+        * Test if a string is an empty element tag.
+        *
+        * @since 0.0.1
+        * @param $tagString The string to test.
+        * @return true if $tagString is an empty element tag, else false.
+        */
+
+       private static function isEmptyElementTag( $tagString ) {
+               return substr( $tagString, -2 ) == '/>';
+       }
+
+       /**
+        * Get the index in $tags of the tag that starts the element which
+        * ends with $tagString.
+        *
+        * Traverses $tags backwards and tests if start tags are of the
+        * same type as the one in $tagString.
+        *
+        * @since 0.0.1
+        * @param array $tags Tag array, as returned from getTags().
+        * @param string $tagString the end tag to find start tag for, as
+        *  HTML string.
+        * @return int The index in $tags of the start tag found.
+        */
+
+       private static function getCorrespondingStartTagIndex(
+               $tags,
+               $tagString
+       ) {
+               for ( $i = count( $tags ) - 1; $i >= 0; $i -- ) {
+                       $tag = $tags[$i];
+                       if ( is_array( $tag ) && count( $tag ) == 1 ) {
+                               // Make sure the tag to test is an array, i.e. 
a start
+                               // tag, and that it doesn't have an end tag 
already.
+                               $startTagType = self::getTagName( 
$tag[0]['string'] );
+                               $endTagType = self::getTagName( $tagString );
+                               if ( $startTagType == $endTagType ) {
+                                       return $i;
+                               }
+                       }
+               }
+       }
+
+       /**
+        * Get the tag name from a tag string.
+        *
+        * @since 0.0.1
+        * @param string $tagString The tag as string.
+        * @return string The name of the tag in $tagString.
+        */
+
+       private static function getTagName( $tagString ) {
+               $nameMatch = null;
+               preg_match( '!</?([^ />]+)( />)?!', $tagString, $nameMatch );
+               $tagName = $nameMatch[1];
+               return $tagName;
+       }
+
+       /**
+        * Recursively add content as either CleanedTags or strings.
+        *
+        * Goes through all the child nodes of $node and adds the
+        * corresponding content. If a child is a tag, it's added as a
+        * CleanedTag of the appropriate type (Start, End or Empty). If a
+        * child is a text node, the text is added as a string.
+        *
+        * @since 0.0.1
+        * @param array $content The resulting array of CleanedTags and
+        *  strings.
+        * @param DOMNode $node The top node to add from.
+        * @param string $source The HTML string that DOM is generated
+        *  from. Used for retrieveing element contents.
+        * @param array $tags Tag array, as generated by getTags().
+        * @param int $tagIndex The index of the next tag, from $tags.
+        */
+
+       private static function addContent(
+               &$content,
+               $node,
+               $source,
+               $tags,
+               &$tagIndex
+       ) {
+               $startTagArray = null;
+               $endTagArray = null;
+               if ( $tagIndex >= 0 ) {
+                       // Don't add the dummy tag.
+                       if ( is_array( $tags[$tagIndex] ) ) {
+                               // If an item in $tags is an array, it holds 
arrays
+                               // for start and end tags.
+                               $startTagArray = $tags[$tagIndex][0];
+                               $endTagArray = $tags[$tagIndex][1];
+                               $cleanedStartTag = self::addStartTag(
+                                       $content,
+                                       $startTagArray['string']
+                               );
+                       } else {
+                               // If the tag is empty, just add it and return, 
since
+                               // there can't any child nodes.
+                               $emptyElementTagString = $tags[$tagIndex];
+                               self::addEmptyElementTag( $content, 
$emptyElementTagString );
+                               return;
+                       }
+               }
+               if ( self::matchesRemove( $node ) ) {
+                       // When a tag is removed, skip forward a number of tags
+                       // equal to the number of nodes under that tag.
+                       $tagIndex += self::getNumberOfDescendants( $node );
+                       $cleanedStartTag->contentLength = 
self::getContentLength(
+                               $startTagArray,
+                               $endTagArray,
+                               $source
+                       );
+               } else {
+                       self::addChildren(
+                               $content,
+                               $node,
+                               $source,
+                               $tags,
+                               $tagIndex
+                       );
+               }
+               if ( $endTagArray != null ) {
+                       array_push(
+                               $content,
+                               new CleanedEndTag( $endTagArray['string'] )
+                       );
+               }
+       }
+
+       /**
+        * Add a CleanedStartTag to an array.
+        *
+        * @since 0.0.1
+        * @param array $content The array that the tag representation is
+        *  added to.
+        * @param string $tagString A string representation of a tag.
+        * @return CleanedStartTag The added tag representation.
+        */
+
+       private static function addStartTag( &$content, $tagString ) {
+               $cleanedStartTag = new CleanedStartTag( $tagString );
+               array_push( $content, $cleanedStartTag );
+               return $cleanedStartTag;
+       }
+
+       /**
+        * Add an CleanedEmptyElementTag to an array.
+        *
+        * @since 0.0.1
+        * @param array $content The array that the tag representation is
+        *  added to.
+        * @param string $tagString String representation of a tag.
+        */
+
+       private static function addEmptyElementTag( &$content, $tagString ) {
+               $cleanedTag = new CleanedEmptyElementTag( $tagString );
+               array_push( $content, $cleanedTag );
+       }
+
+       /**
+        * Get the number of nodes that are descendants of a given node.
+        *
+        * @since 0.0.1
+        * @param DOMNode $node The node to get number of descendants of.
+        * @return int The number of decendants of $node.
+        */
+
+       private static function getNumberOfDescendants( $node ) {
+               if ( !$node->hasChildNodes() ) {
+                       return 0;
+               }
+               $numberOfDescendants = 0;
+               foreach ( $node->childNodes as $child ) {
+                       if ( $child->nodeType != XML_TEXT_NODE ) {
+                               $numberOfDescendants +=
+                                       1 + self::getNumberOfDescendants( 
$child );
+                       }
+               }
+               return $numberOfDescendants;
+       }
+
+       /**
+        * Get the length of the element content.
+        *
+        * The element content is the string between the start tag and the
+        * end tag, excluding the tags themselves.
+        *
+        * @since 0.0.1
+        * @param array $startTagArray Array containing string and
+        *  position for the start tag.
+        * @param array $endTagArray Array containing string and
+        *  position for the end tag.
+        * @param string $source The HTML string that DOM is generated
+        *  from. Used for retrieveing element contents.
+        */
+
+       private static function getContentLength(
+               $startTagArray,
+               $endTagArray,
+               $source
+       ) {
+               $elementContentStartPosition =
+                       $startTagArray['position'] + strlen( 
$startTagArray['string'] );
+               $length = $endTagArray['position'] - 
$elementContentStartPosition;
+               $elementContentString =
+                       substr( $source, $elementContentStartPosition, $length 
);
+               return strlen( $elementContentString );
+       }
+
+       /**
+        * Add content for child nodes to an array.
+        *
+        * @since 0.0.1
+        * @param array $content The array that the children are added to.
+        * @param DOMNode $node Add content for the children of this node.
+        * @param string $source The HTML string that DOM is generated
+        *  from. Used for retrieveing element contents.
+        * @param array $tags Tag array, as generated by getTags().
+        * @param int $tagIndex The index of the next tag, from $tags.
+        */
+
+       private static function addChildren(
+               &$content,
+               $node,
+               $source,
+               $tags,
+               &$tagIndex
+       ) {
+               foreach ( $node->childNodes as $child ) {
+                       if ( $child->nodeType == XML_TEXT_NODE ) {
+                               array_push( $content, $child->textContent );
+                       } else {
+                               // Nodes are handled even if their parents are
+                               // removed, to not get the DOM nodes out of 
sync with
+                               // $tags.
+                               $tagIndex += 1;
+                               self::addContent(
+                                       $content,
+                                       $child,
+                                       $source,
+                                       $tags,
+                                       $tagIndex
+                               );
+                       }
+               }
+       }
+
+       /**
+        * Check if a node matches criteria for removal.
+        *
+        * The node is compared to the removal criteria from the
+        * configuration, to determine if it should be removed completely.
+        *
+        * @since 0.0.1
+        * @param DOMNode $node The node to check.
+        * @return bool true if the node match removal criteria, otherwise
+        *  false.
         */
 
        private static function matchesRemove( $node ) {
@@ -80,12 +435,12 @@
                        // The node name isn't found in the removal list.
                        return false;
                }
-               $removeCriteria = $wgWikispeechRemoveTags[ $node->nodeName ];
+               $removeCriteria = $wgWikispeechRemoveTags[$node->nodeName];
                if ( $removeCriteria === true ) {
                        // Node name is found and there are no extra criteria.
                        return true;
                }
-               if ( self::nodeHasClass( $node, $removeCriteria[ 'class' ] ) ) {
+               if ( self::nodeHasClass( $node, $removeCriteria['class'] ) ) {
                        // Node name and class name match.
                        return true;
                }
@@ -95,14 +450,15 @@
        /**
         * Check if a node has a class attribute, containing a string.
         *
-        * Since this is for checking HTML tag classes, the class attribute, if
-        * present, is assumed to be a string of substrings, sepparated by 
spaces.
+        * Since this is for checking HTML tag classes, the class
+        * attribute, if present, is assumed to be a string of substrings,
+        * sepparated by spaces.
         *
         * @since 0.0.1
         * @param DOMNode $node The node to check.
         * @param string $className The name of the class to check for.
-        * @return bool true if the node's class attribute contain $className,
-        * otherwise false.
+        * @return bool true if the node's class attribute contain
+        *  $className, otherwise false.
         */
 
        private static function nodeHasClass( $node, $className ) {
diff --git a/includes/HtmlGenerator.php b/includes/HtmlGenerator.php
index 6d94604..8278fd0 100644
--- a/includes/HtmlGenerator.php
+++ b/includes/HtmlGenerator.php
@@ -6,71 +6,116 @@
  * @license GPL-2.0+
  */
 
+require_once 'CleanedTag.php';
+
 class HtmlGenerator {
 
        /**
-        * Generate an HTML string for a sequence of utternaces. Utterance tags
-        * look like this:
-        * <utterance id="utterance-0><text>Utterance 
string.</text><audio></audio></utterance>
-        * The <text> and <audio> tags are used to request audio from the TTS
-        * server and store the response.
+        * Create an HTML string for a sequence of utterances.
         *
         * @since 0.0.1
-        * @param array $utterances The utterance strings to generate HTML from.
-        * @return string An HTML string containing the <utterance> tags, 
wrapped
-        *      in an <utterances> tag.
+        * @param array $segments Array of segments to generate utterances
+        *  from.
+        * @return string An HTML string containing the <utterance> tags,
+        *  wrapped in an <utterances> tag.
         */
 
-       public static function generateUtterancesHtml( $utterances ) {
-               if ( count( $utterances ) ) {
+       public static function createUtterancesHtml( $segments ) {
+               if ( count( $segments ) ) {
                        $dom = new DOMDocument();
                        $utterancesNode = $dom->createElement( 'utterances' );
                        // Hide the content of the utterance elements.
                        $utterancesNode->setAttribute( 'hidden', '' );
                        $index = 0;
-                       foreach ( $utterances as $utteranceString ) {
-                               $utteranceNode = self::generateUtteranceElement(
+                       foreach ( $segments as $segment ) {
+                               $utteranceElement = 
self::createUtteranceElement(
                                        $dom,
-                                       $utteranceString,
+                                       $segment,
                                        $index
                                );
-                               $utterancesNode->appendChild( $utteranceNode );
+                               $utterancesNode->appendChild( $utteranceElement 
);
                                $index += 1;
                        }
-                       $utternacesHtml = urldecode( $dom->saveHTML( 
$utterancesNode ) );
+                       $utternacesHtml = $dom->saveHTML( $utterancesNode );
                        return $utternacesHtml;
                }
        }
 
        /**
-        * Create an utterance element, which has child elements for the 
utterance
-        * string and audio.
+        * Create an utterance element.
+        *
+        * The element looks like this in HTML:
+        * <utterance id="utterance-0>
+        *   <content>
+        *     Utterance string with <cleaned-tag>not removed tag</cleaned-tag>.
+        *   </content>
+        * </utterance>
+        *
+        * The id is a zero based index, used to find the adjacent
+        * utterances, when next or previous utterance should be played.
+        *
+        * The content element contains a representation of the HTML that
+        * was used to generate this utterance. Text nodes are the same as
+        * in the original HTML. Elements are represented by cleaned-tag
+        * elements, whose contents are the tags from the original HTML,
+        * excluding < and >.
         *
         * @since 0.0.1
         * @param DOMDocument $dom The DOMDocument to use for creating the
-        *      elements.
-        * @param string $utteranceString The string to add to the text element,
-        *      which is later sent to the TTS server.
-        * @param int $index The index of the element, used for giving it an id.
-        *      Later used for playing the utterances in the correct order.
+        *  element.
+        * @param array $segment An array with position and content as an
+        *  array of CleanedTags and strings.
+        * @param int $index The index of the element, used for giving it
+        *  an id. Later used for playing the utterances in the correct
+        *  order.
         * @return DOMElement The resulting utterance element.
         */
 
-       private static function generateUtteranceElement(
-               $dom,
-               $utteranceString,
-               $index
-       ) {
+       private static function createUtteranceElement( $dom, $segment, $index 
) {
                $utteranceElement = $dom->createElement( 'utterance' );
                $utteranceElement->setAttribute( 'id', "utterance-$index" );
-               $textNode = $dom->createElement(
-                       'text',
-                       // URL encoding (and later decoding) if required due to
-                       // strings containing # not being written otherwise.
-                       urlencode( $utteranceString ) );
-               $utteranceElement->appendChild( $textNode );
-               $audioNode = $dom->createElement( 'audio' );
-               $utteranceElement->appendChild( $audioNode );
+               $utteranceElement->setAttribute(
+                       'position',
+                       $segment['position']
+               );
+               $contentElement = self::createContentElement(
+                       $dom,
+                       $segment['content']
+               );
+               $utteranceElement->appendChild( $contentElement );
                return $utteranceElement;
        }
+
+       /**
+        * Create a content element from a content array.
+        *
+        * CleanedTags are represented as cleaned-tag elements, strings as
+        * text nodes.
+        *
+        * @since 0.0.1
+        * @param array $content An array of CleanedTags and strings.
+        * @return DOMNode A content element.
+        */
+
+       private static function createContentElement( $dom, $content ) {
+               $contentElement = $dom->createElement( 'content' );
+               foreach ( $content as $part ) {
+                       if ( $part instanceof CleanedTag ) {
+                               // Remove the < and > from the tag string to 
not have to
+                               // decode them later.
+                               $text = substr( $part->tagString, 1, -1 );
+                               $cleanedTagElement = $dom->createElement( 
'cleaned-tag', $text );
+                               if (
+                                       $part instanceof CleanedStartTag &&
+                                       $part->contentLength
+                               ) {
+                                       $cleanedTagElement->setAttribute( 
'removed', $part->contentLength );
+                               }
+                               $contentElement->appendChild( 
$cleanedTagElement );
+                       } else {
+                               $contentElement->appendChild( 
$dom->createTextNode( $part ) );
+                       }
+               }
+               return $contentElement;
+       }
 }
diff --git a/includes/Segmenter.php b/includes/Segmenter.php
index f9af191..a1fea80 100644
--- a/includes/Segmenter.php
+++ b/includes/Segmenter.php
@@ -6,49 +6,118 @@
  * @license GPL-2.0+
  */
 
+require_once 'CleanedTag.php';
+
 class Segmenter {
 
        /**
-        * Divide a string into segments, where each segment is a sentence. A
-        * sentence is here defined as a number of tokens ending with a dot 
(full
-        * stop) or a newline. Headings are also considered sentences.
+        * Divide a cleaned content array into segments, one for each sentence.
+        *
+        * A segment is an array with the keys "content" and "position". 
Content is
+        * an array of CleanedTags and strings. Position is the start
+        * position, in the HTML, for the first node in content, i.e. the start
+        * position of the segment.
+        *
+        * A sentence is here defined as a number of tokens ending with a dot 
(full
+        * stop). Headings are also considered sentences.
         *
         * @since 0.0.1
-        * @param string $text A string to segment.
-        * @return array The segments found.
+        * @param array $cleanedContent An array of cleaned content, as 
returned by
+        *  Cleaner::cleanHtml().
+        * @return array An array of segments, each containing the nodes in that
+        *  segment and the start position in the HTML.
         */
 
-       public static function segmentSentences( $text ) {
-               $matches = [];
-               // Find the indices of all characters that may be sentence 
final.
-               preg_match_all(
-                       "/(.|\n)/",
-                       $text,
-                       $matches,
-                       PREG_OFFSET_CAPTURE );
-               $start = 0;
+       public static function segmentSentences( $cleanedContent ) {
                $segments = [];
-               foreach ( $matches[ 0 ] as $match ) {
-                       $index = $match[ 1 ];
-                       if ( self::isSentenceFinal( $text, $index ) ) {
-                               $length = $index - $start + 1;
-                               $segment = trim( substr( $text, $start, $length 
) );
-                               if ( $segment != '' ) {
-                                       // Strings that are only whitespaces 
are not considered
-                                       // sentences.
-                                       array_push( $segments, $segment );
-                                       // Start the next sentence after the 
sentence final
-                                       // character.
-                                       $start = $index + 1;
-                               }
+               $currentSegment = [
+                       'position' => 0,
+                       'content' => []
+               ];
+               foreach ( $cleanedContent as $content ) {
+                       if ( $content instanceof CleanedTag ) {
+                               // Non-text nodes are always added to the 
current segment, as
+                               // they can't contain segment breaks.
+                               array_push( $currentSegment['content'], 
$content );
+                       } else {
+                               self::addSegments(
+                                       $segments,
+                                       $currentSegment,
+                                       $content
+                               );
                        }
+               }
+               if ( $currentSegment['content'] ) {
+                       // Add the last segment, unless it's empty.
+                       array_push( $segments, $currentSegment );
                }
                return $segments;
        }
 
        /**
-        * Tests if a character is at the end of a sentence. Dots in 
abbreviations
-        * should only be counted when they also are sentence final. For 
example:
+        * Add segments for a string.
+        *
+        * Looks for sentence final strings (strings which a sentence ends
+        * with). When a sentence final string is found, its sentence is
+        * added to the $currentSegment, which in turn is added to
+        * $segments. An empty array is created as the new
+        * $currentSegment.
+        *
+        * When the end of string is reached, the remaining string is
+        * added to $currentSegment. Subsequent segment parts will be
+        * added to this segment.
+        *
+        * @since 0.0.1
+        * @param array $segments The segment array to add new segments to.
+        * @param array $currentSegment The segment under construction, to which
+        *  the first found string segment will be added.
+        * @param string $text The string to segment.
+        */
+
+       private static function addSegments(
+               &$segments,
+               &$currentSegment,
+               $text
+       ) {
+               // Find the indices of all characters that may be sentence final.
+               $matches = [];
+               preg_match_all(
+                       "/\./",
+                       $text,
+                       $matches,
+                       PREG_OFFSET_CAPTURE
+               );
+               // Index, in $text, of the first character that hasn't been added
+               // to a segment yet.
+               $position = 0;
+               foreach ( $matches[0] as $match ) {
+                       $sentenceFinalPosition = $match[1];
+                       if ( self::isSentenceFinal( $text, $sentenceFinalPosition ) ) {
+                               $length = $sentenceFinalPosition - $position + 1;
+                               $segmentText = substr( $text, $position, $length );
+                               if ( trim( $segmentText ) !== '' ) {
+                                       // Don't add segments with only whitespaces.
+                                       array_push( $currentSegment['content'], $segmentText );
+                                       $position = $sentenceFinalPosition + 1;
+                                       array_push( $segments, $currentSegment );
+                                       // The next segment starts where the finished one
+                                       // ends.
+                                       $nextSegmentPosition = $currentSegment['position'] +
+                                               self::getSegmentLength( $currentSegment['content'] );
+                                       $currentSegment = [
+                                               'position' => $nextSegmentPosition,
+                                               'content' => []
+                                       ];
+                               }
+                       }
+               }
+               $remainder = substr( $text, $position );
+               // Compare explicitly instead of relying on truthiness: the string
+               // "0" is falsy but is real content that must be kept. substr() may
+               // return false rather than '' when nothing is left, depending on
+               // the PHP version.
+               if ( $remainder !== false && $remainder !== '' ) {
+                       // Add any remaining part of the string.
+                       array_push( $currentSegment['content'], $remainder );
+               }
+       }
+
+       /**
+        * Test if a character is at the end of a sentence.
+        *
+        * Dots in abbreviations should only be counted when they also are 
sentence
+        * final. For example:
         * "Monkeys, penguins etc.", but not "Monkeys e.g. baboons".
         *
         * @since 0.0.1
@@ -58,26 +127,23 @@
         */
 
        private static function isSentenceFinal( $string, $index ) {
-               $character = $string[ $index ];
+               $character = $string[$index];
                $nextCharacter = null;
                if ( strlen( $string ) > $index + 1 ) {
-                       $nextCharacter = $string[ $index + 1 ];
+                       $nextCharacter = $string[$index + 1];
                }
                $characterAfterNext = null;
                if ( strlen( $string ) > $index + 2 ) {
-                       $characterAfterNext = $string[ $index + 2 ];
+                       $characterAfterNext = $string[$index + 2];
                }
-               if ( $character == "\n" ) {
-                       // A newline is always sentence final.
-                       return true;
-               } elseif (
+               if (
                        $character == '.' &&
-                       $nextCharacter == ' ' && self::isUpper( 
$characterAfterNext ) ||
-                       $nextCharacter == "\n" ||
-                       $nextCharacter == ''
+                       ( $nextCharacter == ' ' && self::isUpper( 
$characterAfterNext ) ||
+                       $nextCharacter == '' ||
+                       $nextCharacter == "\n" )
                ) {
                        // A dot is sentence final if it's followed by a space 
and a
-                       // capital letter, at the end of line or at the end of 
string.
+                       // capital letter or at the end of string or line.
                        return true;
                } else {
                        return false;
@@ -85,11 +151,11 @@
        }
 
        /**
-        * Tests if a string is upper case.
+        * Test if a string is upper case.
         *
         * @since 0.0.1
         * @param string $string The string to test.
-        * @return bool True if the entire string is upper case, else false.
+        * @return bool true if the entire string is upper case, else false.
         */
 
        private static function isUpper( $string ) {
@@ -97,21 +163,22 @@
        }
 
        /**
-        * Split a string by newline.
+        * Calculate the length of a segment, as it is represented in HTML.
         *
         * @since 0.0.1
-        * @param string $text A string to segment.
-        * @return array The segments found. Segments only containing 
whitespaces
-        * are discarded.
+        * @param array $segment An array of nodes.
+        * @return int The combined length of the HTML of the nodes in
+        *  $segment.
         */
 
-       public static function segmentParagraphs( $text ) {
-               $segments = [];
-               foreach ( explode( "\n", $text ) as $segment ) {
-                       if ( strlen( trim( $segment ) ) > 0 ) {
-                               array_push( $segments, $segment );
+       private static function getSegmentLength( $segment ) {
+               $length = 0;
+               foreach ( $segment as $content ) {
+                       if ( $content instanceof CleanedTag ) {
+                               $length += $content->getLength();
+                       } else {
+                               $length += strlen( $content );
                        }
                }
-               return $segments;
+               return $length;
        }
 }
diff --git a/modules/ext.wikispeech.js b/modules/ext.wikispeech.js
index a416bf5..b7679ac 100644
--- a/modules/ext.wikispeech.js
+++ b/modules/ext.wikispeech.js
@@ -14,9 +14,7 @@
                 */
 
                this.addPlayStopButton = function () {
-                       var $playStopButton;
-
-                       $playStopButton = $( '<button></button>' )
+                       var $playStopButton = $( '<button></button>' )
                                .attr( 'id', 'ext-wikispeech-play-stop-button' )
                                .addClass( 'ext-wikispeech-play' );
                        $( '#firstHeading' ).append( $playStopButton );
@@ -42,10 +40,8 @@
                 */
 
                this.play = function () {
-                       var $playStopButton;
-
+                       var $playStopButton = $( 
'#ext-wikispeech-play-stop-button' );
                        self.playUtterance( $( '#utterance-0' ) );
-                       $playStopButton = $( '#ext-wikispeech-play-stop-button' 
);
                        $playStopButton.removeClass( 'ext-wikispeech-play' );
                        $playStopButton.addClass( 'ext-wikispeech-stop' );
                };
@@ -81,11 +77,9 @@
                 */
 
                this.stop = function () {
-                       var $playStopButton;
-
+                       var $playStopButton = $( 
'#ext-wikispeech-play-stop-button' );
                        self.stopUtterance( $currentUtterance );
                        $currentUtterance = $();
-                       $playStopButton = $( '#ext-wikispeech-play-stop-button' 
);
                        $playStopButton.removeClass( 'ext-wikispeech-stop' );
                        $playStopButton.addClass( 'ext-wikispeech-play' );
                };
@@ -98,9 +92,7 @@
                 */
 
                this.addSkipAheadSentenceButton = function () {
-                       var $skipAheadSentenceButton;
-
-                       $skipAheadSentenceButton = $( '<button></button>' )
+                       var $skipAheadSentenceButton = $( '<button></button>' )
                                .attr( 'id', 
'ext-wikispeech-skip-ahead-sentence-button' )
                                .addClass( 'ext-wikispeech-skip-ahead-sentence' 
);
                        $( '#firstHeading' ).append( $skipAheadSentenceButton );
@@ -116,9 +108,7 @@
                 */
 
                this.skipAheadUtterance = function () {
-                       var $nextUtterance;
-
-                       $nextUtterance = self.getNextUtterance( 
$currentUtterance );
+                       var $nextUtterance = self.getNextUtterance( 
$currentUtterance );
                        if ( $nextUtterance.length ) {
                                self.playUtterance( $nextUtterance );
                        } else {
@@ -131,14 +121,11 @@
                 */
 
                this.addKeyboardShortcuts = function () {
-                       var shortcuts;
-
+                       var shortcuts = mw.config.get( 
'wgWikispeechKeyboardShortcuts' );
                        $( document ).keydown( function ( event ) {
-                               shortcuts = mw.config.get( 
'wgWikispeechKeyboardShortcuts' );
                                if ( self.eventMatchShortcut( event, 
shortcuts.playStop ) ) {
                                        self.playOrStop();
-                               }
-                               if ( self.eventMatchShortcut(
+                               } else if ( self.eventMatchShortcut(
                                        event,
                                        shortcuts.skipAheadUtterance )
                                ) {
@@ -212,7 +199,7 @@
                 *
                 * @param $utterance The original utterance.
                 * @return The utterance after the original utterance. Empty 
object if
-                *      $utterance isn't a valid utterance.
+                *  $utterance isn't a valid utterance.
                 */
 
                this.getNextUtterance = function ( $utterance ) {
@@ -233,8 +220,7 @@
                /**
                 * Request audio for an utterance.
                 *
-                * When the response is received, set the audio URL as the 
source for
-                * the utterance's audio element.
+                * Adds audio and token elements when the response is received.
                 *
                 * @param $utterance The utterance to load audio for.
                 */
@@ -242,14 +228,23 @@
                this.loadAudio = function ( $utterance ) {
                        var $audio, text, audioUrl;
 
-                       $audio = $utterance.children( 'audio' );
+                       $audio = $( '<audio></audio>' ).appendTo( $utterance );
                        mw.log( 'Loading audio for: ' + $utterance.attr( 'id' ) 
);
-                       text = $utterance.children( 'text' ).text();
+                       // Get the combined string of the text nodes only, i.e. 
not from
+                       // the cleaned tag.
+                       text = $utterance.children( 'content' 
).contents().filter(
+                               function () {
+                                       // Filter text nodes. Not using 
Node.TEXT_NODE to
+                                       // support IE7.
+                                       return this.nodeType === 3;
+                               }
+                       ).text();
                        self.requestTts( text, function ( response ) {
                                audioUrl = response.audio;
                                mw.log( 'Setting url for ' + $utterance.attr( 
'id' ) + ': ' +
-                                               audioUrl );
+                                       audioUrl );
                                $audio.attr( 'src', audioUrl );
+                               self.addTokenElements( $utterance, 
response.tokens );
                        } );
                        $utterance.prop( 'requested', true );
                };
@@ -267,7 +262,7 @@
                 *
                 * @param {string} text The utterance string to send in the 
request.
                 * @param {Function} callback Function to be called when a 
response
-                *      is received.
+                *  is received.
                 */
 
                this.requestTts = function ( text, callback ) {
@@ -289,12 +284,118 @@
                                // jscs:enable 
requireCamelCaseOrUpperCaseIdentifiers
                        } );
                        request.onload = function () {
+                               mw.log( 'Response received: ' + 
request.responseText );
                                response = JSON.parse( request.responseText );
                                callback( response );
                        };
                        mw.log( 'Sending request: ' + serverUrl + '?' + 
parameters );
                        request.send( parameters );
                };
+
+               /**
+                * Add token elements to an utterance element.
+                *
+                * Adds a tokens element and populates it with token elements.
+                *
+                * @param $utterance The jQuery object to add tokens to.
+                * @param tokens Array of tokens from a server response, where 
each
+                *  token is an object. For these objects, the property "orth" 
is the
+                *  string used by the TTS to generate audio for the token.
+                */
+
+               this.addTokenElements = function ( $utterance, tokens ) {
+                       var htmlPosition, $tokenContainer, $parts, nextTokenIndex;
+
+                       // Running character position in the original HTML. It starts
+                       // at the utterance's own position, which is the earliest
+                       // point any of its tokens can appear at.
+                       htmlPosition = parseInt( $utterance.attr( 'position' ), 10 );
+                       $tokenContainer = $( '<tokens></tokens>' ).appendTo( $utterance );
+                       $parts = $utterance.children( 'content' ).contents();
+                       // Index, in tokens, of the first token that has no element
+                       // yet.
+                       nextTokenIndex = 0;
+                       mw.log( 'Adding tokens to ' + $utterance.attr( 'id' ) + ':' );
+                       $parts.each( function ( index, node ) {
+                               var removed;
+
+                               if ( node.tagName !== 'CLEANED-TAG' ) {
+                                       // A text node: create elements for the tokens
+                                       // found in it and remember where to continue.
+                                       nextTokenIndex = self.addTokensForTextElement(
+                                               tokens,
+                                               node,
+                                               htmlPosition,
+                                               $tokenContainer,
+                                               nextTokenIndex
+                                       );
+                               } else {
+                                       // A cleaned tag: skip any content that was removed
+                                       // along with it.
+                                       removed = node.getAttribute( 'removed' );
+                                       if ( removed !== null ) {
+                                               htmlPosition += parseInt( removed, 10 );
+                                       }
+                                       // The < and > were stripped from the tag at an
+                                       // earlier stage, so they are counted here.
+                                       htmlPosition += 2;
+                               }
+                               // Both kinds of nodes advance the position by the length
+                               // of their text.
+                               htmlPosition += node.textContent.length;
+                       } );
+               };
+
+               /**
+                * Add a token element for each token that matches a substring of
+                * the given text element.
+                *
+                * Goes through textElement, finds substrings matching tokens 
and
+                * creates token elements for these. The position for the token
+                * elements is the substring position plus the position of 
textElement.
+                * When a token can no longer be found, the index of that token 
is
+                * returned to remember what to start looking for in the next 
text
+                * element.
+                *
+                * @param tokens Array of tokens from a server response, where 
each
+                *  token is an object. For these objects, the property "orth" 
is the
+                *  string used by the TTS to generate audio for the token.
+                * @param textElement The text element to match tokens against.
+                * @param {int} startPosition The position of the original text
+                *  element.
+                * @param $tokensElement Element which token elements are added 
to.
+                * @param {int} firstTokenIndex The index of the first token in 
tokens
+                *  to search for.
+                * @return {int} The index of the first token that wasn't found.
+                */
+
+               this.addTokensForTextElement = function (
+                       tokens,
+                       textElement,
+                       startPosition,
+                       $tokensElement,
+                       firstTokenIndex
+               ) {
+                       var positionInElement, matchingPosition, tokenPositionInHtml,
+                               orthographicToken, i, token;
+
+                       // Index in the text element where the search for the next
+                       // token starts, i.e. just after the previously matched token.
+                       positionInElement = 0;
+                       for ( i = firstTokenIndex; i < tokens.length; i++ ) {
+                               token = tokens[ i ];
+                               orthographicToken = token.orth;
+                               // Look for the token in the remaining string. The
+                               // returned index is relative to the start of the text
+                               // element, not to the search start.
+                               matchingPosition = textElement.nodeValue.indexOf(
+                                       orthographicToken,
+                                       positionInElement
+                               );
+                               if ( matchingPosition === -1 ) {
+                                       // The token wasn't found in this element. Stop
+                                       // looking for more and return the index of the
+                                       // token.
+                                       return i;
+                               }
+                               tokenPositionInHtml = startPosition + matchingPosition;
+                               mw.log( '  "' + orthographicToken + '", position: ' +
+                                       tokenPositionInHtml );
+                               $( '<token></token>' )
+                                       .text( orthographicToken )
+                                       .attr( 'position', tokenPositionInHtml )
+                                       .appendTo( $tokensElement );
+                               // Continue after the end of the matched token. Advancing
+                               // by the token length alone would ignore any text skipped
+                               // before the match, and a repeated token could then be
+                               // matched at the same position twice.
+                               positionInElement =
+                                       matchingPosition + orthographicToken.length;
+                       }
+                       // Every remaining token was found; there is nothing left to
+                       // look for in later text elements.
+                       return tokens.length;
+               };
        }
 
        mw.wikispeech = {};
diff --git a/tests/phpunit/CleanerTest.php b/tests/phpunit/CleanerTest.php
index 1272949..94c67fc 100644
--- a/tests/phpunit/CleanerTest.php
+++ b/tests/phpunit/CleanerTest.php
@@ -7,6 +7,7 @@
  */
 
 require_once __DIR__ . '/../../includes/Cleaner.php';
+require_once 'Util.php';
 
 class CleanerTest extends MediaWikiTestCase {
        protected function setUp() {
@@ -16,14 +17,19 @@
                        'table' => true,
                        'sup' => [ 'class' => 'reference' ],
                        'editsection' => true,
-                       'h2' => false
+                       'h2' => false,
+                       'del' => true
                ];
        }
 
        public function testCleanTags() {
-               $markedUpText = '<i>Blonde on Blonde</i>';
-               $expectedText = 'Blonde on Blonde';
-               $this->assertTextCleaned( $expectedText, $markedUpText );
+               $markedUpText = '<i>Element content</i>';
+               $expectedCleanedContent = [
+                       Util::createStartTag( '<i>' ),
+                       'Element content',
+                       new CleanedEndTag( '</i>' )
+               ];
+               $this->assertTextCleaned( $expectedCleanedContent, 
$markedUpText );
        }
 
        /**
@@ -35,89 +41,335 @@
         * should not be altered.
         *
         * @since 0.0.1
-        * @param string $expectedText The string that is the expected output
-        * from the function named by $function.
+        * @param array $expectedCleanedContent The content that is the expected
+        *  output.
         * @param string $markedUpText The string that contains the markup
-        * that should be cleaned. Used as input to the function named by
-        * $function.
+        *  that should be cleaned
         */
 
-       private function assertTextCleaned( $expectedText, $markedUpText ) {
+       private function assertTextCleaned(
+               $expectedCleanedContent,
+               $markedUpText
+       ) {
                $this->assertEquals(
-                       $expectedText,
+                       $expectedCleanedContent,
                        Cleaner::cleanHtml( $markedUpText )
                );
-               $this->assertEquals( 'prefix' . $expectedText . 'suffix',
-                       Cleaner::cleanHtml( 'prefix' . $markedUpText . 'suffix' 
) );
-               $this->assertEquals( $expectedText . 'infix' . $expectedText,
-                       Cleaner::cleanHtml( $markedUpText . 'infix' . 
$markedUpText ) );
-               $this->assertEquals( 'A string without any fancy markup.',
-                       Cleaner::cleanHtml( 'A string without any fancy 
markup.' ) );
+               $this->assertWithPrefixAndSuffix(
+                       $expectedCleanedContent,
+                       $markedUpText
+               );
+               $this->assertWithInfix(
+                       $expectedCleanedContent,
+                       $markedUpText
+               );
+       }
+
+       /**
+        * Make sure that the correct content is given when preceded and
+        * followed by text.
+        *
+        * Pre- and suffix strings are concatenated to the first and last
+        * part respectively, of the expected content if they are
+        * strings. If they are CleanedTags, they are added as new parts.
+        *
+        * @since 0.0.1
+        * @param array $expectedCleanedContent The content that is the expected
+        *  output, excluding pre- and suffix.
+        * @param string $markedUpText The string that contains the markup
+        *  that should be cleaned
+        */
+
+       private function assertWithPrefixAndSuffix(
+               $expectedCleanedContent,
+               $markedUpText
+       ) {
+               $expected = $expectedCleanedContent;
+               // Text in front of the markup ends up in the leading string, if
+               // there is one, otherwise as a part of its own.
+               if ( !is_string( $expected[0] ) ) {
+                       array_unshift( $expected, 'prefix' );
+               } else {
+                       $expected[0] = 'prefix' . $expected[0];
+               }
+               // The same goes for text after the markup and the trailing
+               // string.
+               $tailIndex = count( $expected ) - 1;
+               if ( !is_string( $expected[$tailIndex] ) ) {
+                       $expected[] = 'suffix';
+               } else {
+                       $expected[$tailIndex] = $expected[$tailIndex] . 'suffix';
+               }
+               $actual = Cleaner::cleanHtml( 'prefix' . $markedUpText . 'suffix' );
+               $this->assertEquals( $expected, $actual );
+       }
+
+       /**
+        * Make sure that the correct content is given when the marked up
+        * text is repeated, with text in between.
+        *
+        * If the first instance of the expected content ends with a
+        * string, the infix is added after that. If the second instance
+        * starts with a string, the infix is added before that. If both
+        * cases occur at the same time, the string between the instances
+        * will consist of the last string of first instance, infix and
+        * first string of second instance.
+        *
+        * @since 0.0.1
+        * @param array $expectedCleanedContent The content that will be
+        *  repeated to create the expected output.
+        * @param string $markedUpText The string that contains the markup
+        *  that should be cleaned
+        */
+
+       private function assertWithInfix(
+               $expectedCleanedContent,
+               $markedUpText
+       ) {
+               $infix = 'infix';
+               $firstContent = $expectedCleanedContent;
+               // The infix follows the END of the first instance, so it is the
+               // last part that is merged with the infix if it's a string.
+               if ( is_string( end( $firstContent ) ) ) {
+                       $adjacent = array_pop( $firstContent );
+                       $infix = $adjacent . $infix;
+               }
+               $secondContent = $expectedCleanedContent;
+               // The infix precedes the START of the second instance, so it is
+               // the first part that is merged with the infix if it's a string.
+               if ( is_string( reset( $secondContent ) ) ) {
+                       $adjacent = array_shift( $secondContent );
+                       $infix .= $adjacent;
+               }
+               $this->assertEquals(
+                       array_merge( $firstContent, [ $infix ], $secondContent ),
+                       Cleaner::cleanHtml( $markedUpText . 'infix' . $markedUpText )
+               );
+       }
+
+       public function testDontAlterStringWithoutMarkup() {
+               // Text without markup is expected back as a single, unchanged
+               // string part.
+               $plainText = 'A string without any fancy markup.';
+               $this->assertEquals(
+                       [ $plainText ],
+                       Cleaner::cleanHtml( $plainText )
+               );
+       }
 
        public function testCleanNestedTags() {
-               $markedUpText = '<i><b>Blonde on Blonde</b></i>';
-               $expectedText = 'Blonde on Blonde';
-               $this->assertTextCleaned( $expectedText, $markedUpText );
+               $markedUpText = '<i><b>Nested content</b></i>';
+               $expectedCleanedContent = [
+                       Util::createStartTag( '<i>' ),
+                       Util::createStartTag( '<b>' ),
+                       'Nested content',
+                       new CleanedEndTag( '</b>' ),
+                       new CleanedEndTag( '</i>' )
+               ];
+               $this->assertTextCleaned( $expectedCleanedContent, 
$markedUpText );
        }
 
-       public function testCleanEmptyTags() {
-               $markedUpText = '<img alt="" src="image.png" />';
-               $expectedText = '';
-               $this->assertTextCleaned( $expectedText, $markedUpText );
+       public function testCleanEmptyElementTags() {
+               // An empty element tag is expected to be the only cleaned part.
+               $emptyElementTag = '<br />';
+               $this->assertTextCleaned(
+                       [ new CleanedEmptyElementTag( $emptyElementTag ) ],
+                       $emptyElementTag
+               );
        }
 
-       public function testRemoveTagsAltogether() {
-               $markedUpText = '<table>Remove this table, please.</table>';
-               $expectedText = '';
-               $this->assertTextCleaned( $expectedText, $markedUpText );
+       public function testRemoveTags() {
+               $markedUpText = '<del>removed tag </del>';
+               $expectedCleanedContent = [
+                       Util::createStartTag( '<del>', 'removed tag ' ),
+                       new CleanedEndTag( '</del>' )
+               ];
+               $this->assertTextCleaned( $expectedCleanedContent, 
$markedUpText );
+       }
+
+       public function testDontAddCleanedTagsForTagsUnderRemovedTags() {
+               $markedUpText = '<del><i>nested removed tag</i></del>';
+               $expectedCleanedContent = [
+                       Util::createStartTag( '<del>', '<i>nested removed 
tag</i>' ),
+                       new CleanedEndTag( '</del>' )
+               ];
+               $this->assertTextCleaned( $expectedCleanedContent, 
$markedUpText );
+       }
+
+       public function testRemoveDoubleNestedTags() {
+               $markedUpText = '<del><i><b>double nested removed 
tag</b></i></del>';
+               $expectedCleanedContent = [
+                       Util::createStartTag(
+                               '<del>',
+                               '<i><b>double nested removed tag</b></i>' // inner HTML of <del>
+                       ),
+                       new CleanedEndTag( '</del>' )
+               ];
+               $this->assertTextCleaned( $expectedCleanedContent, 
$markedUpText );
        }
 
        public function testRemoveTagsWithCertainClass() {
-               $markedUpText = '<sup class="reference"><a>[1]</a>Also remove 
this.</sup>';
-               $expectedText = '';
-               $this->assertTextCleaned( $expectedText, $markedUpText );
+               $markedUpText = '<sup class="reference">Remove this.</sup>';
+               $expectedCleanedContent = [
+                       Util::createStartTag(
+                               '<sup class="reference">',
+                               'Remove this.'
+                       ),
+                       new CleanedEndTag( '</sup>' )
+               ];
+               $this->assertTextCleaned( $expectedCleanedContent, 
$markedUpText );
+       }
+
+       public function testDontRemoveTagsWithoutCertainClass() {
+               $markedUpText =
+                       '<sup>I am not a reference.</sup><sup 
class="not-a-reference">Neither am I.</sup>';
+               $expectedCleanedContent = [
+                       Util::createStartTag( '<sup>' ),
+                       'I am not a reference.',
+                       new CleanedEndTag( '</sup>' ),
+                       Util::createStartTag( '<sup class="not-a-reference">' ),
+                       'Neither am I.',
+                       new CleanedEndTag( '</sup>' )
+               ];
+               $this->assertTextCleaned( $expectedCleanedContent, 
$markedUpText );
        }
 
        public function testDontRemoveTagsWhichCriteriaAreFalse() {
                $markedUpText = '<h2>Contents</h2>';
-               $expectedText = 'Contents';
-               $this->assertTextCleaned( $expectedText, $markedUpText );
-       }
-
-       public function testDontRemoveTagsWithoutCertainClass() {
-               // @codingStandardsIgnoreStart
-               $markedUpText = '<sup>I am not a reference.</sup><sup 
class="not-a-reference">Neither am I.</sup>';
-               // @codingStandardsIgnoreEnd
-               $expectedText = 'I am not a reference.Neither am I.';
-               $this->assertTextCleaned( $expectedText, $markedUpText );
+               $expectedCleanedContent = [
+                       Util::createStartTag( '<h2>' ),
+                       'Contents',
+                       new CleanedEndTag( '</h2>' )
+               ];
+               $this->assertTextCleaned( $expectedCleanedContent, 
$markedUpText );
        }
 
        public function testHandleMultipleClasses() {
-               // @codingStandardsIgnoreStart
-               $markedUpText = '<sup class="reference another-class"><a 
href="#cite_note-Grayp5-1">[1]</a>Also remove this.</sup>';
-               // @codingStandardsIgnoreEnd
-               $expectedText = '';
-               $this->assertTextCleaned( $expectedText, $markedUpText );
+               $markedUpText =
+                       '<sup class="reference another-class">Remove 
this.</sup>';
+               $expectedCleanedContent = [
+                       Util::createStartTag(
+                               '<sup class="reference another-class">',
+                               'Remove this.'
+                       ),
+                       new CleanedEndTag( '</sup>' )
+               ];
+               $this->assertTextCleaned( $expectedCleanedContent, 
$markedUpText );
        }
 
        public function testCleanNestedTagsWhereSomeAreRemovedAndSomeAreKept() {
-               // @codingStandardsIgnoreStart
-               $markedUpText = '<h2><span class="mw-headline" 
id="Recording_sessions">Recording sessions</span><mw:editsection page="Test 
Page" section="1">Recording sessions *REMOVE THIS*</mw:editsection></h2>';
-               // @codingStandardsIgnoreEnd
-               $expectedText = 'Recording sessions';
-               $this->assertTextCleaned( $expectedText, $markedUpText );
+               $markedUpText = '<i><b>not removed</b><del>removed</del></i>';
+               $expectedCleanedContent = [
+                       Util::createStartTag( '<i>' ),
+                       Util::createStartTag( '<b>' ),
+                       'not removed',
+                       new CleanedEndTag( '</b>' ),
+                       Util::createStartTag( '<del>', 'removed' ),
+                       new CleanedEndTag( '</del>' ),
+                       new CleanedEndTag( '</i>' )
+               ];
+               $this->assertTextCleaned( $expectedCleanedContent, 
$markedUpText );
        }
 
        public function testHandleUtf8Characters() {
                $markedUpText = '—';
-               $expectedText = '—';
-               $this->assertTextCleaned( $expectedText, $markedUpText );
+               $expectedCleanedContent = [ '—' ];
+               $this->assertTextCleaned( $expectedCleanedContent, 
$markedUpText );
        }
 
        public function testHandleHtmlEntities() {
                $markedUpText = '6&#160;p.m';
-               $expectedText = '6 p.m';
-               $this->assertTextCleaned( $expectedText, $markedUpText );
+               $expectedCleanedContent = [ '6 p.m' ];
+               $this->assertTextCleaned( $expectedCleanedContent, 
$markedUpText );
+       }
+
+       public function testHandleNewlines() {
+               $markedUpText = "<i>Keep this newline\n</i>";
+               $expectedCleanedContent = [
+                       Util::createStartTag( '<i>' ),
+                       "Keep this newline\n",
+                       new CleanedEndTag( '</i>' )
+               ];
+               $this->assertTextCleaned( $expectedCleanedContent, 
$markedUpText );
+       }
+
+       public function testHandleEndTagFollowedByEmptyElementTag() {
+               $markedUpText = '<i>content</i><br />';
+               $expectedCleanedContent = [
+                       Util::createStartTag( '<i>' ),
+                       'content',
+                       new CleanedEndTag( '</i>' ),
+                       new CleanedEmptyElementTag( '<br />' )
+               ];
+               $this->assertTextCleaned( $expectedCleanedContent, 
$markedUpText );
+       }
+
+       public function testHandleEmptyElementTagInsideElement() {
+               $markedUpText = '<i>content<br /></i>';
+               $expectedCleanedContent = [
+                       Util::createStartTag( '<i>' ),
+                       'content',
+                       new CleanedEmptyElementTag( '<br />' ),
+                       new CleanedEndTag( '</i>' )
+               ];
+               $this->assertTextCleaned( $expectedCleanedContent, 
$markedUpText );
+       }
+
+       public function testGetTags() {
+               $textWithTags = '<i>content</i>';
+               $expectedTags = [ [
+                       [ 'string' => '<i>', 'position' => 0 ],
+                       [ 'string' => '</i>', 'position' => 10 ]
+               ] ];
+               $this->assertEquals(
+                       $expectedTags,
+                       Util::call( 'Cleaner', 'getTags', $textWithTags )
+               );
+       }
+
+       public function testGetTagsEmptyElementTag() {
+               $textWithTags = '<br />';
+               $expectedTags = [ '<br />' ];
+               $this->assertEquals(
+                       $expectedTags,
+                       Util::call( 'Cleaner', 'getTags', $textWithTags )
+               );
+       }
+
+       public function testGetTagsEmptyElementTagWithoutSpace() {
+               $textWithTags = '<br/>';
+               $expectedTags = [ '<br/>' ];
+               $this->assertEquals(
+                       $expectedTags,
+                       Util::call( 'Cleaner', 'getTags', $textWithTags )
+               );
+       }
+
+       public function testGetTagsNestedTags() {
+               $textWithTags = '<i>content<b>content</b></i>';
+               $expectedTags = [
+                       [
+                               [ 'string' => '<i>', 'position' => 0 ],
+                               [ 'string' => '</i>', 'position' => 24 ]
+                       ],
+                       [
+                               [ 'string' => '<b>', 'position' => 10 ],
+                               [ 'string' => '</b>', 'position' => 20 ]
+                       ]
+               ];
+               $this->assertEquals(
+                       $expectedTags,
+                       Util::call( 'Cleaner', 'getTags', $textWithTags )
+               );
+       }
+
+       public function testGetTagsNestedTagsOfSameType() {
+               $textWithTags = '<i id="1">content<i id="2">content</i></i>';
+               $expectedTags = [
+                       [
+                               [ 'string' => '<i id="1">', 'position' => 0 ],
+                               [ 'string' => '</i>', 'position' => 38 ]
+                       ],
+                       [
+                               [ 'string' => '<i id="2">', 'position' => 17 ],
+                               [ 'string' => '</i>', 'position' => 34 ]
+                       ]
+               ];
+               $this->assertEquals(
+                       $expectedTags,
+                       Util::call( 'Cleaner', 'getTags', $textWithTags )
+               );
        }
 }
diff --git a/tests/phpunit/HtmlGeneratorTest.php 
b/tests/phpunit/HtmlGeneratorTest.php
index 6f26234..929f0c4 100644
--- a/tests/phpunit/HtmlGeneratorTest.php
+++ b/tests/phpunit/HtmlGeneratorTest.php
@@ -7,39 +7,108 @@
  */
 
 require_once __DIR__ . '/../../includes/HtmlGenerator.php';
+require_once 'Util.php';
 
 class HtmlGeneratorTest extends MediaWikiTestCase {
-       public function testGenerateUtterancesHtml() {
-               $utterancesStrings = [ 'An utterance.', 'Another utterance.' ];
-               $actualHtml = HtmlGenerator::generateUtterancesHtml(
-                       $utterancesStrings
-               );
-               // @codingStandardsIgnoreStart
-               $expectedHtml = '<utterances hidden=""><utterance 
id="utterance-0"><text>An 
utterance.</text><audio></audio></utterance><utterance 
id="utterance-1"><text>Another 
utterance.</text><audio></audio></utterance></utterances>';
-               // @codingStandardsIgnoreEnd
-               $this->assertEquals( $expectedHtml, $actualHtml );
-       }
-
-       public function testGenerateUtteranceContainingNumberSign() {
-               // @codingStandardsIgnoreStart
-               $utterancesStrings = [ 'Blonde on Blonde spawned two singles 
that were top-twenty hits in the US: "Rainy Day Women #12 & 35" and "I Want 
You".'
+       public function testCreateUtteranceElement() {
+               $segment = [
+                       'position' => 0,
+                       'content' => [ 'An utterance.' ],
                ];
-               // @codingStandardsIgnoreEnd
-               $actualHtml = HtmlGenerator::generateUtterancesHtml(
-                       $utterancesStrings
+               $element = Util::call(
+                       'HtmlGenerator',
+                       'createUtteranceElement',
+                       new DOMDocument(),
+                       $segment,
+                       0
                );
+               $this->assertEquals( 'utterance', $element->nodeName );
+               $this->assertEquals( 'utterance-0', $element->getAttribute( 
'id' ) );
+               $this->assertEquals( 0, $element->getAttribute( 'position' ) );
+               $this->assertEquals( 'content', $element->firstChild->nodeName 
);
+               $this->assertEquals(
+                       'An utterance.',
+                       $element->firstChild->nodeValue
+               );
+       }
+
+       public function testCreateUtteranceContainingNumberSign() {
+               $segment = [
+                       'position' => 0,
+                       'content' => [ 'This is #1.' ]
+               ];
+               $element = Util::call(
+                       'HtmlGenerator',
+                       'createUtteranceElement',
+                       new DOMDocument,
+                       $segment,
+                       0
+               );
+               $this->assertEquals( 'This is #1.', 
$element->firstChild->nodeValue );
+       }
+
+       public function testCreateUtteranceContainingTagBrackets() {
+               $segment = [
+                       'position' => 0,
+                       'content' => [ 'This is not really a <tag>.' ]
+               ];
+               $element = Util::call(
+                       'HtmlGenerator',
+                       'createUtteranceElement',
+                       new DOMDocument,
+                       $segment,
+                       0
+               );
+               $this->assertEquals(
+                       'This is not really a <tag>.',
+                       $element->firstChild->nodeValue
+               );
+       }
+
+       public function testDontCreateUtterancesHtmlForNoUtterances() {
+               $segments = [];
+               $html = HtmlGenerator::createUtterancesHtml( $segments );
+               $expectedHtml = '';
+               $this->assertEquals( $expectedHtml, $html );
+       }
+
+       public function testCreateUtterancesMultipleUtterances() {
+               $segments = [
+                       [
+                               'position' => 0,
+                               'content' => [ 'Sentence 1.' ]
+                       ],
+                       [
+                               'position' => 11,
+                               'content' => [ ' Sentence 2.' ]
+                       ]
+
+               ];
+               $actualHtml = HtmlGenerator::createUtterancesHtml( $segments );
                // @codingStandardsIgnoreStart
-               $expectedHtml = '<utterances hidden=""><utterance 
id="utterance-0"><text>Blonde on Blonde spawned two singles that were 
top-twenty hits in the US: "Rainy Day Women #12 & 35" and "I Want 
You".</text><audio></audio></utterance></utterances>';
+               $expectedHtml =
+                       '<utterances hidden=""><utterance id="utterance-0" 
position="0"><content>Sentence 1.</content></utterance><utterance 
id="utterance-1" position="11"><content> Sentence 
2.</content></utterance></utterances>';
                // @codingStandardsIgnoreEnd
                $this->assertEquals( $expectedHtml, $actualHtml );
        }
 
-       public function testDontGenerateUtterancesHtmlForNoUtterances() {
-               $utterancesStrings = [];
-               $actualHtml = HtmlGenerator::generateUtterancesHtml(
-                       $utterancesStrings
-               );
-               $expectedHtml = '';
-               $this->assertEquals( $expectedHtml, $actualHtml );
+       public function testCreateUtterancesContainingRemovedTags() {
+               $segments = [
+                       [
+                               'position' => 0,
+                               'content' => [
+                                       'Here is a ',
+                                       Util::createStartTag( '<i>' ),
+                                       'tag',
+                                       new CleanedEndTag( '</i>' )
+                               ]
+                       ]
+               ];
+               $html = HtmlGenerator::createUtterancesHtml( $segments );
+               // @codingStandardsIgnoreStart
+               $expectedHtml =
+                       '<utterances hidden=""><utterance id="utterance-0" 
position="0"><content>Here is a 
<cleaned-tag>i</cleaned-tag>tag<cleaned-tag>/i</cleaned-tag></content></utterance></utterances>';
+               // @codingStandardsIgnoreEnd
+               $this->assertEquals( $expectedHtml, $html );
        }
 }
diff --git a/tests/phpunit/SegmenterTest.php b/tests/phpunit/SegmenterTest.php
index 4f70d97..0b4f6cf 100644
--- a/tests/phpunit/SegmenterTest.php
+++ b/tests/phpunit/SegmenterTest.php
@@ -7,74 +7,174 @@
  */
 
 require_once __DIR__ . '/../../includes/Segmenter.php';
+require_once 'Util.php';
 
 class SegmenterTest extends MediaWikiTestCase {
-
        public function testSegmentSentences() {
-               // @codingStandardsIgnoreStart
-               $input = "Blonde on Blonde is the seventh studio album by 
American singer-songwriter Bob Dylan, released on May 16, 1966, on Columbia 
Records. Recording sessions began in New York in October 1965 with numerous 
backing musicians, including members of Dylan's live backing band, the Hawks.";
+               $cleanedContent = [
+                       'Sentence 1. Sentence 2.'
+               ];
                $expectedSegments = [
-                       'Blonde on Blonde is the seventh studio album by 
American singer-songwriter Bob Dylan, released on May 16, 1966, on Columbia 
Records.',
-                       "Recording sessions began in New York in October 1965 
with numerous backing musicians, including members of Dylan's live backing 
band, the Hawks." ];
-               // @codingStandardsIgnoreEnd
-               $segments = Segmenter::segmentSentences( $input );
+                       [
+                               'position' => 0,
+                               'content' => [ 'Sentence 1.' ]
+                       ],
+                       [
+                               'position' => 11,
+                               'content' => [ ' Sentence 2.' ]
+                       ]
+               ];
+               $segments = Segmenter::segmentSentences( $cleanedContent );
                $this->assertEquals( $expectedSegments, $segments );
        }
 
        public function testDontSegmentByEllipses() {
-               $input = "I mean, in ten recording sessions, man, we didn't get 
one song...It was the band.";
+               $cleanedContent = [
+                       'This is... one sentence.'
+                       ];
                $expectedSegments = [
-                       "I mean, in ten recording sessions, man, we didn't get 
one song...It was the band." ];
-               $segments = Segmenter::segmentSentences( $input );
+                       [
+                               'position' => 0,
+                               'content' => [ 'This is... one sentence.' ]
+                       ]
+               ];
+               $segments = Segmenter::segmentSentences( $cleanedContent );
                $this->assertEquals( $expectedSegments, $segments );
        }
 
        public function testDontSegmentByAbbreviations() {
-               // @codingStandardsIgnoreStart
-               $input = 'On February 15 the session began at 6&nbsp;p.m. but 
Dylan simply sat in the studio working on his lyrics while the musicians played 
cards, napped and chatted.';
+               $cleanedContent = [ 'One sentence i.e. one segment.' ];
                $expectedSegments = [
-                       'On February 15 the session began at 6&nbsp;p.m. but 
Dylan simply sat in the studio working on his lyrics while the musicians played 
cards, napped and chatted.' ];
-               // @codingStandardsIgnoreEnd
-               $segments = Segmenter::segmentSentences( $input );
+                       [
+                               'position' => 0,
+                               'content' => [ 'One sentence i.e. one segment.' 
]
+                       ]
+               ];
+               $segments = Segmenter::segmentSentences( $cleanedContent );
                $this->assertEquals( $expectedSegments, $segments );
        }
 
        public function testDontSegmentByDotDirectlyFollowedByComma() {
-               // @codingStandardsIgnoreStart
-               $input = 'Two people had strongly recommended the Hawks to 
Dylan: Mary Martin, the executive secretary of Albert Grossman, and blues 
singer John Hammond, Jr., son of record producer John Hammond, who had signed 
Dylan to Columbia Records in 1961; the Hawks had backed the younger Hammond on 
his 1965 album So Many Roads.';
+               $cleanedContent = [ 'As with etc., jr. and friends.' ];
                $expectedSegments = [
-                       'Two people had strongly recommended the Hawks to 
Dylan: Mary Martin, the executive secretary of Albert Grossman, and blues 
singer John Hammond, Jr., son of record producer John Hammond, who had signed 
Dylan to Columbia Records in 1961; the Hawks had backed the younger Hammond on 
his 1965 album So Many Roads.' ];
-               // @codingStandardsIgnoreEnd
-               $segments = Segmenter::segmentSentences( $input );
-               $this->assertEquals( $expectedSegments, $segments );
-       }
-
-       public function testDontRemoveStringsWithoutDots() {
-               $input = "Recording sessions\n\nBackground";
-               $expectedSegments = [ 'Recording sessions', 'Background' ];
-               $segments = Segmenter::segmentSentences( $input );
-               $this->assertEquals( $expectedSegments, $segments );
-       }
-
-       public function testSegmentParagraphs() {
-               $input = "Recording sessions
-
-Background
-After the release of Highway 61 Revisited in August 1965, Dylan set ...";
-               $expectedSegments = [
-                       'Recording sessions',
-                       'Background',
-                       'After the release of Highway 61 Revisited in August 
1965, Dylan set ...' ];
-               $segments = Segmenter::segmentParagraphs( $input );
+                       [
+                               'position' => 0,
+                               'content' => [ 'As with etc., jr. and friends.' 
]
+                       ]
+               ];
+               $segments = Segmenter::segmentSentences( $cleanedContent );
                $this->assertEquals( $expectedSegments, $segments );
        }
 
        public function testDontSegmentByDecimalDot() {
-               $input = 'the two-CD set went on sale for $18.99 and the 
three-CD version for $129.99';
-               // @codingStandardsIgnoreStart
-               $expectedSegments = [ 'the two-CD set went on sale for $18.99 
and the three-CD version for $129.99' ];
-               // @codingStandardsIgnoreEnd
-               $segments = Segmenter::segmentParagraphs( $input );
+               $cleanedContent = [ 'In numbers like 2.9.' ];
+               $expectedSegments = [
+                       [
+                               'position' => 0,
+                               'content' => [ 'In numbers like 2.9.' ]
+                       ]
+               ];
+               $segments = Segmenter::segmentSentences( $cleanedContent );
+               $this->assertEquals( $expectedSegments, $segments );
+       }
+
+       public function 
testKeepLastSegmentEvenIfNotEndingWithSentenceFinalCharacter() {
+               $cleanedContent = [ 'Recording sessions' ];
+               $expectedSegments = [
+                       [
+                               'position' => 0,
+                               'content' => [ 'Recording sessions' ]
+                       ]
+               ];
+               $segments = Segmenter::segmentSentences( $cleanedContent );
+               $this->assertEquals( $expectedSegments, $segments );
+       }
+
+       public function testSegmentContainingTag() {
+               $cleanedContent = [
+                       'Sentence with a ',
+                       Util::createStartTag( '<i>' ),
+                       'tag',
+                       new CleanedEndTag( '</i>' ),
+                       '.'
+               ];
+               $expectedSegments = [
+                       [
+                               'position' => 0,
+                               'content' => [
+                                       'Sentence with a ',
+                                       Util::createStartTag( '<i>' ),
+                                       'tag',
+                                       new CleanedEndTag( '</i>' ),
+                                       '.'
+                               ]
+                       ]
+               ];
+               $segments = Segmenter::segmentSentences( $cleanedContent );
+               $this->assertEquals( $expectedSegments, $segments );
+       }
+
+       public function testSegmentEndingWithTag() {
+               $cleanedContent = [
+                       "There's a tag after this",
+                       new CleanedEmptyElementTag( '<br />' )
+               ];
+               $expectedSegments = [
+                       [
+                               'position' => 0,
+                               'content' => [
+                                       "There's a tag after this",
+                                       new CleanedEmptyElementTag( '<br />' )
+                               ]
+                       ]
+               ];
+               $segments = Segmenter::segmentSentences( $cleanedContent );
+               $this->assertEquals( $expectedSegments, $segments );
+       }
+
+       public function testCalculatePosition() {
+               $cleanedContent = [ 'Segment 1.', 'Segment 2.', 'Segment 3.' ];
+               $expectedSegments = [
+                       [
+                               'position' => 0,
+                               'content' => [ 'Segment 1.' ]
+                       ],
+                       [
+                               'position' => 10,
+                               'content' => [ 'Segment 2.' ]
+                       ],
+                       [
+                               'position' => 20,
+                               'content' => [ 'Segment 3.' ]
+                       ],
+               ];
+               $segments = Segmenter::segmentSentences( $cleanedContent );
+               $this->assertEquals( $expectedSegments, $segments );
+       }
+
+       public function testCalculatePositionWhenTagIsRemoved() {
+               $cleanedContent = [
+                       'Sentence with a ',
+                       Util::createStartTag( '<del>', 'removed ' ),
+                       new CleanedEndTag( '</del>' ),
+                       'tag. Another sentence.'
+               ];
+               $expectedSegments = [
+                       [
+                               'position' => 0,
+                               'content' => [
+                                       'Sentence with a ',
+                                       Util::createStartTag( '<del>', 'removed 
' ),
+                                       new CleanedEndTag( '</del>' ),
+                                       'tag.',
+                               ]
+                       ],
+                       [
+                               'position' => 39,
+                               'content' => [ ' Another sentence.' ]
+                       ]
+               ];
+               $segments = Segmenter::segmentSentences( $cleanedContent );
                $this->assertEquals( $expectedSegments, $segments );
        }
 }
diff --git a/tests/phpunit/Util.php b/tests/phpunit/Util.php
new file mode 100644
index 0000000..a526fb7
--- /dev/null
+++ b/tests/phpunit/Util.php
@@ -0,0 +1,53 @@
+<?php
+
+/**
+ * @file
+ * @ingroup Extensions
+ * @license GPL-2.0+
+ */
+
+class Util {
+
+       /**
+        * Create a CleanedStartTag and set its $contentLength.
+        *
+        * @since 0.0.1
+        * @param string $tagString The tag string for the CleanedStartTag.
+        * @param string|null $contentString The content string; when not
+        *  null, its byte length (strlen) is stored as $contentLength of
+        *  the CleanedStartTag. null by default.
+        * @return CleanedStartTag
+        */
+
+       public static function createStartTag(
+               $tagString,
+               $contentString = null
+       ) {
+               $cleanedTag = new CleanedStartTag( $tagString );
+               if ( $contentString !== null ) {
+                       $cleanedTag->contentLength = strlen( $contentString );
+               }
+               return $cleanedTag;
+       }
+
+       /**
+        * Call a non-public static function.
+        *
+        * Used for testing functions that normally can't be called in
+        * tests. Any arguments beyond $class and $function are sent as
+        * arguments to $function.
+        *
+        * @since 0.0.1
+        * @param string $class The name of the class that holds the function.
+        * @param string $function The name of the function to call
+        * @return mixed Whatever $function returns
+        */
+
+       public static function call( $class, $function ) {
+               $reflection = new ReflectionMethod( $class, $function );
+               $reflection->setAccessible( true ); // lift the visibility restriction
+               $arguments = array_slice( func_get_args(), 2 ); // drop $class and $function
+               return $reflection->invokeArgs( null, $arguments ); // null object: static call
+       }
+
+}
diff --git a/tests/qunit/ext.wikispeech.test.js 
b/tests/qunit/ext.wikispeech.test.js
index 9ae7fb2..4733b62 100644
--- a/tests/qunit/ext.wikispeech.test.js
+++ b/tests/qunit/ext.wikispeech.test.js
@@ -3,22 +3,26 @@
 
        QUnit.module( 'ext.wikispeech', {
                setup: function () {
+                       var $utterances;
+
                        wikispeech = new mw.wikispeech.Wikispeech();
                        server = sinon.fakeServer.create();
-                       server.respondWith( '{"audio": 
"http://server.url/audio"}' );
                        // overrideMimeType() isn't defined by default.
                        server.xhr.prototype.overrideMimeType = function () {};
-                       $( '#qunit-fixture' ).append( createUtteranceElement(
-                               'utterance-0',
-                               'A mockup utterance.'
-                       ) );
-                       $( '#qunit-fixture' ).append( createUtteranceElement(
-                               'utterance-1',
-                               'Another mockup utterance.'
-                       ) );
                        $( '#qunit-fixture' ).append(
                                $( '<h1></h1>' ).attr( 'id', 'firstHeading' )
                        );
+                       $utterances = $( '#qunit-fixture' ).append(
+                               $( '<utterances></utterances>' )
+                       );
+                       $( '<utterance></utterance>' )
+                               .attr( 'id', 'utterance-0' )
+                               .attr( 'position', 0 )
+                               .appendTo( $utterances );
+                       $( '<utterance></utterance>' )
+                               .attr( 'id', 'utterance-1' )
+                               .attr( 'position', 1 )
+                               .appendTo( $utterances );
                        mw.config.set(
                                'wgWikispeechKeyboardShortcuts', {
                                        playStop: {
@@ -36,14 +40,6 @@
                        server.restore();
                }
        } );
-
-       function createUtteranceElement( id, text ) {
-               return $( '<utterance></utterance>' )
-                       .attr( 'id', id )
-                       .append( $( '<text></text>' )
-                               .text( text ) )
-                       .append( $( '<audio></audio>' ) );
-       }
 
        QUnit.test( 'prepareUtterance', function ( assert ) {
                assert.expect( 1 );
@@ -70,11 +66,9 @@
        } );
 
        QUnit.test( 'prepareUtterance: prepare next utterance when playing', 
function ( assert ) {
-               var $nextUtterance;
-
+               var $nextUtterance = $( '#utterance-1' );
                assert.expect( 1 );
                wikispeech.prepareUtterance( $( '#utterance-0' ) );
-               $nextUtterance = $( '#utterance-1' );
                sinon.spy( wikispeech, 'prepareUtterance' );
 
                $( '#utterance-0 audio' ).trigger( 'play' );
@@ -115,11 +109,9 @@
        } );
 
        QUnit.test( 'prepareUtterance: stop when end of text is reached', 
function ( assert ) {
-               var $lastUtterance;
-
+               var $lastUtterance = $( '#utterance-1' );
                assert.expect( 1 );
                sinon.spy( wikispeech, 'stop' );
-               $lastUtterance = $( '#utterance-1' );
                wikispeech.prepareUtterance( $lastUtterance );
                wikispeech.playUtterance( $lastUtterance );
 
@@ -129,21 +121,32 @@
        } );
 
        QUnit.test( 'loadAudio', function ( assert ) {
-               assert.expect( 3 );
+               assert.expect( 4 );
+               $( '<content></content>' )
+                       .append( 'An utterance.' )
+                       .appendTo( $( '#utterance-0' ) );
+               server.respondWith(
+                       '{"audio": "http://server.url/audio", "tokens": 
[{"orth": "An"}, {"orth": "utterance"}, {"orth": "."}]}'
+               );
+               sinon.spy( wikispeech, 'addTokenElements' );
 
                wikispeech.loadAudio( $( '#utterance-0' ) );
 
                server.respond();
                assert.strictEqual(
                        server.requests[ 0 ].requestBody,
-                       'lang=en&input_type=text&input=A+mockup+utterance.'
+                       'lang=en&input_type=text&input=An+utterance.'
                );
                assert.strictEqual(
                        $( '#utterance-0 audio' ).attr( 'src' ),
                        'http://server.url/audio'
                );
+               assert.strictEqual( $( '#utterance-0' ).prop( 'requested' ), 
true );
                assert.strictEqual(
-                       $( '#utterance-0' ).prop( 'requested' ),
+                       wikispeech.addTokenElements.calledWith(
+                               $( '#utterance-0' ),
+                               [ { orth: 'An' }, { orth: 'utterance' }, { 
orth: '.' } ]
+                       ),
                        true
                );
        } );
@@ -208,9 +211,7 @@
         */
 
        function createKeydownEvent( keyCode, modifiers ) {
-               var event;
-
-               event = $.Event( 'keydown' );
+               var event = $.Event( 'keydown' );
                event.which = keyCode;
                event.ctrlKey = modifiers.indexOf( 'c' ) >= 0;
                event.altKey = modifiers.indexOf( 'a' ) >= 0;
@@ -232,6 +233,7 @@
                assert.expect( 4 );
                wikispeech.addPlayStopButton();
                wikispeech.play();
+               wikispeech.prepareUtterance( $( '#utterance-0' ) );
                $( '#utterance-0 audio' ).prop( 'currentTime', 1 );
 
                wikispeech.stop();
@@ -254,11 +256,10 @@
        } );
 
        QUnit.test( 'play', function ( assert ) {
-               var $firstUtterance;
-
+               var $firstUtterance = $( '#utterance-0' );
                assert.expect( 3 );
                wikispeech.addPlayStopButton();
-               $firstUtterance = $( '#utterance-0' );
+               wikispeech.prepareUtterance( $firstUtterance );
 
                wikispeech.play();
 
@@ -293,6 +294,7 @@
 
        QUnit.test( 'skipAheadUtterance', function ( assert ) {
                assert.expect( 2 );
+               wikispeech.prepareUtterance( $( '#utterance-0' ) );
                wikispeech.play();
 
                wikispeech.skipAheadUtterance();
@@ -318,8 +320,8 @@
                var $nextUtterance;
 
                assert.expect( 1 );
-               $nextUtterance =
-                       wikispeech.getNextUtterance( $( '#utterance-0' ) );
+
+               $nextUtterance = wikispeech.getNextUtterance( $( '#utterance-0' 
) );
 
                assert.strictEqual(
                        $nextUtterance.get( 0 ),
@@ -331,8 +333,152 @@
                var $nextUtterance;
 
                assert.expect( 1 );
+
                $nextUtterance = wikispeech.getNextUtterance( $() );
 
                assert.strictEqual( $nextUtterance.length, 0 );
        } );
+
+       QUnit.test( 'addTokenElements', function ( assert ) {
+               var tokens, $tokensElement, $expectedTokensElement;
+
+               $( '<content></content>' ).html( 'An utterance.' )
+                       .appendTo( $( '#utterance-0' ) );
+               tokens = [
+                       { orth: 'An' },
+                       { orth: 'utterance' },
+                       { orth: '.' }
+               ];
+
+               wikispeech.addTokenElements( $( '#utterance-0' ), tokens );
+
+               $tokensElement = $( '#utterance-0' ).children( 'tokens' );
+               $expectedTokensElement = $( '<tokens></tokens>' )
+                       .append(
+                               $( '<token></token>' )
+                                       .text( 'An' )
+                                       .attr( 'position', 0 )
+                       )
+                       .append(
+                               $( '<token></token>' )
+                                       .text( 'utterance' )
+                                       .attr( 'position', 3 )
+                       )
+                       .append(
+                               $( '<token></token>' )
+                                       .text( '.' )
+                                       .attr( 'position', 12 )
+                       );
+               assert.strictEqual(
+                       $tokensElement.prop( 'outerHTML' ),
+                       $expectedTokensElement.prop( 'outerHTML' )
+               );
+       } );
+
+       QUnit.test( 'addTokenElements: handle tags', function ( assert ) {
+               var tokens, $tokensElement, $expectedTokensElement;
+
+               $( '<content></content>' ).html(
+                       'Utterance with 
<cleaned-tag>b</cleaned-tag>tag<cleaned-tag>/b</cleaned-tag>.'
+               )
+                       .appendTo( $( '#utterance-0' ) );
+               tokens = [
+                       { orth: 'Utterance' },
+                       { orth: 'with' },
+                       { orth: 'tag' },
+                       { orth: '.' }
+               ];
+
+               wikispeech.addTokenElements( $( '#utterance-0' ), tokens );
+
+               $tokensElement = $( '#utterance-0' ).children( 'tokens' );
+               $expectedTokensElement = $( '<tokens></tokens>' )
+                       .append(
+                               $( '<token></token>' )
+                                       .text( 'Utterance' )
+                                       .attr( 'position', 0 )
+                       )
+                       .append(
+                               $( '<token></token>' )
+                                       .text( 'with' )
+                                       .attr( 'position', 10 )
+                       )
+                       .append(
+                               $( '<token></token>' )
+                                       .text( 'tag' )
+                                       .attr( 'position', 18 )
+                       )
+                       .append(
+                               $( '<token></token>' )
+                                       .text( '.' )
+                                       .attr( 'position', 25 )
+                       );
+               assert.strictEqual(
+                       $tokensElement.prop( 'outerHTML' ),
+                       $expectedTokensElement.prop( 'outerHTML' )
+               );
+       } );
+
+       QUnit.test( 'addTokenElements: utterance position offset', function ( 
assert ) {
+               var tokens, $tokensElement, $expectedTokensElement;
+
+               $( '<content></content>' ).html( 'An utterance.' )
+                       .appendTo( $( '#utterance-0' ) );
+               $( '#utterance-0' ).attr( 'position', 3 );
+               tokens = [
+                       { orth: 'An' },
+                       { orth: 'utterance' },
+                       { orth: '.' }
+               ];
+
+               wikispeech.addTokenElements( $( '#utterance-0' ), tokens );
+
+               $tokensElement = $( '#utterance-0' ).children( 'tokens' );
+               $expectedTokensElement = $( '<tokens></tokens>' )
+                       .append( $( '<token></token>' )
+                                       .text( 'An' )
+                                       .attr( 'position', 3 ) )
+                       .append( $( '<token></token>' )
+                                       .text( 'utterance' )
+                                       .attr( 'position', 6 ) )
+                       .append( $( '<token></token>' )
+                                       .text( '.' )
+                                       .attr( 'position', 15 ) );
+               assert.strictEqual(
+                       $tokensElement.prop( 'outerHTML' ),
+                       $expectedTokensElement.prop( 'outerHTML' )
+               );
+       } );
+
+       QUnit.test( 'addTokenElements: handle removed element', function ( 
assert ) {
+               var tokens, $tokensElement, $expectedTokensElement;
+
+               $( '<content></content>' ).html(
+                       'Utterance with <cleaned-tag>del</cleaned-tag>removed 
tag<cleaned-tag>/del</cleaned-tag>.'
+               )
+                       .appendTo( $( '#utterance-0' ) );
+               tokens = [
+                       { orth: 'Utterance' },
+                       { orth: 'with' },
+                       { orth: '.' }
+               ];
+
+               wikispeech.addTokenElements( $( '#utterance-0' ), tokens );
+
+               $tokensElement = $( '#utterance-0' ).children( 'tokens' );
+               $expectedTokensElement = $( '<tokens></tokens>' )
+                       .append( $( '<token></token>' )
+                                       .text( 'Utterance' )
+                                       .attr( 'position', 0 ) )
+                       .append( $( '<token></token>' )
+                                       .text( 'with' )
+                                       .attr( 'position', 10 ) )
+                       .append( $( '<token></token>' )
+                                       .text( '.' )
+                                       .attr( 'position', 37 ) );
+               assert.strictEqual(
+                       $tokensElement.prop( 'outerHTML' ),
+                       $expectedTokensElement.prop( 'outerHTML' )
+               );
+       } );
 } )( mediaWiki, jQuery );

-- 
To view, visit https://gerrit.wikimedia.org/r/314237
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: Ie784328fa3d7bcf7941b6b89146687272fe3b0ca
Gerrit-PatchSet: 5
Gerrit-Project: mediawiki/extensions/Wikispeech
Gerrit-Branch: master
Gerrit-Owner: Sebastian Berlin (WMSE) <[email protected]>
Gerrit-Reviewer: Lokal Profil <[email protected]>
Gerrit-Reviewer: Sebastian Berlin (WMSE) <[email protected]>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

[MediaWiki-commits] [Gerrit] mediawiki...Wikispeech[master]: Map tokens from TTS responses to HTML

Reply via email to