Gerrit Patch Uploader has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/344791 )
Change subject: Remove broken plaintext section markers ...................................................................... Remove broken plaintext section markers The heading tags from getExtract are useful even in plaintext output mode. These tags, scrambled to a special format by ExtractFormatter, are used by doSection(), which looks for such markers and reformats them as specified by sectionformat. When truncated, incomplete markers will become U+FFFD characters in the API output (see bug below). This patch: * Adds a public method ExtractFormatter::tidySectionMarkers( $text ) to search for and eliminate broken markers at the end. * Adds a private helper method to construct a regex for the above method. * Modifies ApiQueryExtracts::tidy( $text ) to call the first method for plaintext. Bug: T92628 Change-Id: Ib91fab62601087c891fba0e7fadea7d249c7f18b Change-Id: I010a79b8dbfe35facc2450b7d148aa7a85716019 --- M includes/ApiQueryExtracts.php M includes/ExtractFormatter.php M tests/phpunit/ExtractFormatterTest.php 3 files changed, 91 insertions(+), 2 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/TextExtracts refs/changes/91/344791/1 diff --git a/includes/ApiQueryExtracts.php b/includes/ApiQueryExtracts.php index f80b90b..af24aa1 100644 --- a/includes/ApiQueryExtracts.php +++ b/includes/ApiQueryExtracts.php @@ -318,12 +318,14 @@ } /** - * A simple wrapper around tidy + * Cleans up the text. 
Wraps tidy for HTML, trims out broken section markers for plaintext * @param string $text * @return string */ private function tidy( $text ) { - if ( $this->getConfig()->get( 'UseTidy' ) && !$this->params['plaintext'] ) { + if ( $this->params['plaintext'] ) { + $text = ExtractFormatter::tidySectionMarkers( $text ); + } elseif ( $this->getConfig()->get( 'UseTidy' ) ) { $text = trim( MWTidy::tidy( $text ) ); } return $text; diff --git a/includes/ExtractFormatter.php b/includes/ExtractFormatter.php index 4dd7e92..877cbfd 100644 --- a/includes/ExtractFormatter.php +++ b/includes/ExtractFormatter.php @@ -151,4 +151,49 @@ return $removed; } + + /** + * Trims out broken section markers that would lead to U+FFFD in API response. + * + * @param string $text + * @return string + */ + public static function tidySectionMarkers( $text ) { + static $regexp = false; + if ( $regexp === false ) { + // Prepare a structure for making partial regexes + $split_atoms = array_merge( + str_split(self::SECTION_MARKER_START), + ['\d'], + // A complete marker not followed by text is useless, trim it too + // (Still no guarantee for partial heading text) + str_split(self::SECTION_MARKER_END) + ); + // Look for truncated section markers at the very end + // (That's where truncation happens) + $regexp = '/\n\n' . self::makeTruncatedRegex( $split_atoms ) . "$/D"; + } + + return preg_replace( $regexp, "", $text ); + } + + /** + * Constructs a regexp fragment for matching partial substrings from a list of atoms. + * + * @param array $atoms + * @return string + */ + private static function makeTruncatedRegex( $atoms ) { + $open = "(?:"; // open a non-capture group + $close = ")?"; // end an optional group + + // Maybe use a join() with a hardcoded "?" next time + $regexp = ''; + foreach ( $atom as $atom ) { + $regexp .= $open . 
$atom; + } + $regexp .= str_repeat( $close, count( $atoms ) ); + + return $regexp; + } } diff --git a/tests/phpunit/ExtractFormatterTest.php b/tests/phpunit/ExtractFormatterTest.php index 064871d..5c869bc 100644 --- a/tests/phpunit/ExtractFormatterTest.php +++ b/tests/phpunit/ExtractFormatterTest.php @@ -181,4 +181,46 @@ [ $longText, 65536, $longTextExpected ], ]; } + + /** + * @dataProvider provideTidySectionMarkers + * @param $text + * @param $expected + */ + public function testTidySectionMarkers( $text, $expected ) { + $this->assertEquals( $expected, ExtractFormatter::tidySectionMarkers( $text ) ); + } + + public function provideTidySectionMarkers() { + $start = ExtractFormatter::SECTION_MARKER_START; + $start = $start; + $end = ExtractFormatter::SECTION_MARKER_END; + $broken = $start . '2' . $end; + $broken_length = strlen( $broken ); + + $expected = 'Lorem Ipsum Baa Baa Sheep'; + $expected .= "\n\n" . $start . '2' . $end . "Section 1\n" ; + $expected .= 'This section is meaningless. The next one is broken.'; + $expected .= "\n\n"; // Always appears before sections + + // It should be able to remove all broken slices + $cases = []; + for ($len = 0; $len <= $broken_length; $len++) { + $cases[] = [ + $expected . substr( $broken, 0, $len ), + $expected + ]; + } + + // It should not cut away anything that follows + $cases[] = [ + $expected . $broken . "N", + $expected . $broken . "N" + ]; + $cases[] = [ + $expected . $broken . "Not broken at all\n", + $expected . $broken . 
"Not broken at all\n" + ]; + return $cases; + } } -- To view, visit https://gerrit.wikimedia.org/r/344791 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I010a79b8dbfe35facc2450b7d148aa7a85716019 Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/extensions/TextExtracts Gerrit-Branch: master Gerrit-Owner: Gerrit Patch Uploader <gerritpatchuploa...@gmail.com> Gerrit-Reviewer: Artoria2e5 <arthur200...@gmail.com> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits