Sebastian Berlin (WMSE) has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/298256

Change subject: Segment text
......................................................................

Segment text

Added functionality to segment text into reasonably sized pieces before
sending it to the TTS. The Segmenter class has two functions, one for
segmenting on paragraph level and one for sentence level. The latter is
still quite basic, and may need extending to cover specific cases.

Bug: T135980
Change-Id: Ie1f839e1ccd6f4928522de8e22a50ae474dbda04
---
A includes/Segmenter.php
A tests/phpunit/SegmenterTest.php
2 files changed, 136 insertions(+), 0 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/Wikispeech 
refs/changes/56/298256/1

diff --git a/includes/Segmenter.php b/includes/Segmenter.php
new file mode 100644
index 0000000..d986a6a
--- /dev/null
+++ b/includes/Segmenter.php
@@ -0,0 +1,59 @@
+<?php
+
+/**
+ * @file
+ * @ingroup Extensions
+ * @license GPL-3.0+
+ */
+
+class Segmenter {
+
+       /**
+        * Split a string by sentence final dots and new lines.
+        *
+        * @since 0.0.1
+        * @param string $text A string to segment.
+        * @return array The segments found.
+        */
+       public static function segmentSentences( $text ) {
+               // The regex looks for dots that are not too closely preceded
+               // or followed by another dot, to prevent segmenting on
+               // abbreviations. For the same reason, a dot isn't counted if
+               // it's followed by a comma. Segmentation also happens if
+               // there is a line break, to properly handle headings.
+               preg_match_all(
+                       "/(\s*((.+?)(?<=[^.]{10})[.,]*[.]((?=[^,])(?=[^.]{10})|$))|\n*([^.\n]+))/",
+                       $text,
+                       $matches,
+                       PREG_SET_ORDER );
+               $segments = [];
+               foreach ( $matches as $match ) {
+                       // Group 2: dot-terminated sentence; group 5: text without dots or line breaks.
+                       if ( $match[ 2 ] !== '' ) {
+                               $segments[] = $match[ 2 ];
+                       } elseif ( $match[ 5 ] !== '' ) {
+                               $segments[] = $match[ 5 ];
+                       }
+               }
+               return $segments;
+       }
+
+       /**
+        * Split a string by newline.
+        *
+        * @since 0.0.1
+        * @param string $text A string to segment.
+        * @return array The segments found. Segments only containing
+        * whitespaces are discarded.
+        */
+       public static function segmentParagraphs( $text ) {
+               $segments = [];
+               foreach ( explode( "\n", $text ) as $segment ) {
+                       // Drop whitespace-only lines; kept segments are not trimmed.
+                       if ( trim( $segment ) !== '' ) {
+                               $segments[] = $segment;
+                       }
+               }
+               return $segments;
+       }
+}
diff --git a/tests/phpunit/SegmenterTest.php b/tests/phpunit/SegmenterTest.php
new file mode 100644
index 0000000..81a88ac
--- /dev/null
+++ b/tests/phpunit/SegmenterTest.php
@@ -0,0 +1,77 @@
+<?php
+
+require_once __DIR__ . '/../../includes/Segmenter.php';
+
+/**
+ * @file
+ * @ingroup Extensions
+ * @license GPL-3.0+
+ */
+
+class SegmenterTest extends MediaWikiTestCase {
+
+       /**
+        * Two ordinary sentences are split at the sentence final dot.
+        */
+       public function testSegmentSentences() {
+               // @codingStandardsIgnoreStart
+               $input = "Blonde on Blonde is the seventh studio album by American singer-songwriter Bob Dylan, released on May 16, 1966, on Columbia Records. Recording sessions began in New York in October 1965 with numerous backing musicians, including members of Dylan's live backing band, the Hawks.";
+               $expectedSegments = [
+                       'Blonde on Blonde is the seventh studio album by American singer-songwriter Bob Dylan, released on May 16, 1966, on Columbia Records.',
+                       "Recording sessions began in New York in October 1965 with numerous backing musicians, including members of Dylan's live backing band, the Hawks." ];
+               // @codingStandardsIgnoreEnd
+               $segmenter = new Segmenter();
+               $segments = $segmenter->segmentSentences( $input );
+               $this->assertEquals( $expectedSegments, $segments );
+       }
+
+       /**
+        * An ellipsis ends a segment, and all three dots stay with it.
+        */
+       public function testSegmentByThreeDots() {
+               // @codingStandardsIgnoreStart
+               $input = "I mean, in ten recording sessions, man, we didn't get one song...It was the band.";
+               $expectedSegments = [
+                       "I mean, in ten recording sessions, man, we didn't get one song...",
+                       "It was the band." ];
+               // @codingStandardsIgnoreEnd
+               $segmenter = new Segmenter();
+               $segments = $segmenter->segmentSentences( $input );
+               $this->assertEquals( $expectedSegments, $segments );
+       }
+
+       /**
+        * Dots inside an abbreviation ("a.m.") must not end a segment.
+        */
+       public function testDontSegmentByAbbreviations() {
+               // @codingStandardsIgnoreStart
+               $input = 'Finally, at 4 a.m., Dylan called the musicians in and outlined the structure of the song. Dylan counted off and the musicians fell in, as he attempted his epic composition "Sad Eyed Lady of the Lowlands".';
+               $expectedSegments = [
+                       'Finally, at 4 a.m., Dylan called the musicians in and outlined the structure of the song.',
+                       'Dylan counted off and the musicians fell in, as he attempted his epic composition "Sad Eyed Lady of the Lowlands".' ];
+               // @codingStandardsIgnoreEnd
+               $segmenter = new Segmenter();
+               $segments = $segmenter->segmentSentences( $input );
+               $this->assertEquals( $expectedSegments, $segments );
+       }
+
+       /**
+        * A dot directly followed by a comma ("Jr.,") must not end a segment.
+        */
+       public function testDontSegmentByDotDirectlyFollowedByComma() {
+               // @codingStandardsIgnoreStart
+               $input = 'Two people had strongly recommended the Hawks to Dylan: Mary Martin, the executive secretary of Albert Grossman, and blues singer John Hammond, Jr., son of record producer John Hammond, who had signed Dylan to Columbia Records in 1961; the Hawks had backed the younger Hammond on his 1965 album So Many Roads.';
+               $expectedSegments = [
+                       'Two people had strongly recommended the Hawks to Dylan: Mary Martin, the executive secretary of Albert Grossman, and blues singer John Hammond, Jr., son of record producer John Hammond, who had signed Dylan to Columbia Records in 1961; the Hawks had backed the younger Hammond on his 1965 album So Many Roads.' ];
+               // @codingStandardsIgnoreEnd
+               $segmenter = new Segmenter();
+               $segments = $segmenter->segmentSentences( $input );
+               $this->assertEquals( $expectedSegments, $segments );
+       }
+
+       /**
+        * Lines without any dot (e.g. headings) are still returned as segments.
+        */
+       public function testDontRemoveStringsWithoutDots() {
+               $input = "Recording sessions\n\nBackground";
+               $expectedSegments = [ 'Recording sessions', 'Background' ];
+               $segmenter = new Segmenter();
+               $segments = $segmenter->segmentSentences( $input );
+               $this->assertEquals( $expectedSegments, $segments );
+       }
+
+       /**
+        * Paragraphs are split on newlines and empty lines are discarded.
+        */
+       public function testSegmentParagraphs() {
+               $input = "Recording sessions
+
+Background
+After the release of Highway 61 Revisited in August 1965, Dylan set ...";
+               // @codingStandardsIgnoreStart
+               $expectedSegments = [
+                       'Recording sessions',
+                       'Background',
+                       'After the release of Highway 61 Revisited in August 1965, Dylan set ...' ];
+               // @codingStandardsIgnoreEnd
+               $segmenter = new Segmenter();
+               $segments = $segmenter->segmentParagraphs( $input );
+               $this->assertEquals( $expectedSegments, $segments );
+       }
+}

-- 
To view, visit https://gerrit.wikimedia.org/r/298256
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Ie1f839e1ccd6f4928522de8e22a50ae474dbda04
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/Wikispeech
Gerrit-Branch: master
Gerrit-Owner: Sebastian Berlin (WMSE) <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to