Gerrit Patch Uploader has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/343243 )
Change subject: Remove broken plaintext section markers ...................................................................... Remove broken plaintext section markers The heading tags from getExtract are useful even in plaintext output mode. These tags, scrambled to a special format by ExtractFormatter, are used by doSections(), which looks for such markers and reformats them as specified by sectionformat. When truncated, incomplete markers will become U+FFFD characters in the API output (see bug below). This patch modifies ApiQueryExtracts::tidy( $text ) to search for and eliminate broken markers at the end. Change-Id: I7f88ad8179f3837a2db57abbb4352963e51544db --- M includes/ApiQueryExtracts.php 1 file changed, 37 insertions(+), 2 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/TextExtracts refs/changes/43/343243/1 diff --git a/includes/ApiQueryExtracts.php b/includes/ApiQueryExtracts.php index f80b90b..f0b7682 100644 --- a/includes/ApiQueryExtracts.php +++ b/includes/ApiQueryExtracts.php @@ -318,17 +318,52 @@ } /** - * A simple wrapper around tidy + * Cleans up the text. Wraps tidy for HTML, trims out broken section markers for plaintext * @param string $text * @return string */ private function tidy( $text ) { - if ( $this->getConfig()->get( 'UseTidy' ) && !$this->params['plaintext'] ) { + static $regexp = false; + if ( $this->params['plaintext'] ) { + if ( $regexp === false ) { + // Prepare a structure for making partial regexes + $split_atoms = array_merge( + str_split(ExtractFormatter::SECTION_MARKER_START), + array('\d'), + // A complete marker not followed by text is useless, trim it too + // (Still no guarantee for partial heading text) + str_split(ExtractFormatter::SECTION_MARKER_END) + ); + // Look for truncated section markers at the very end + // (That's where truncation happens) + $regexp = $this->makeTruncatedRegex( $split_atoms ) . 
'$/D'; + } + + $text = preg_replace( $regexp, "", $text ); + } elseif ( $this->getConfig()->get( 'UseTidy' ) ) { $text = trim( MWTidy::tidy( $text ) ); } return $text; } + /* Constructs a regular expression for matching partial substrings from a list of atoms. + * @param array $atoms + * @return string + */ + private function makeTruncatedRegex( $atoms ) { + const OPEN = "(?:"; // open a non-capture group + const CLOSE = ")?"; // end an optional group + + // Maybe use a join() with a hardcoded "?" next time + $regexp = '/'; + foreach ( array_expression as $atom ) { + $regexp .= OPEN . $atom; + } + $regexp .= str_repeat( CLOSE, count( $atoms ) ) . '/'; + + return $regexp; + } + private function doSections( $text ) { $text = preg_replace_callback( "/" . ExtractFormatter::SECTION_MARKER_START . '(\d)'. ExtractFormatter::SECTION_MARKER_END . "(.*?)$/m", -- To view, visit https://gerrit.wikimedia.org/r/343243 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I7f88ad8179f3837a2db57abbb4352963e51544db Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/extensions/TextExtracts Gerrit-Branch: master Gerrit-Owner: Gerrit Patch Uploader <gerritpatchuploa...@gmail.com> Gerrit-Reviewer: Artoria2e5 <arthur200...@gmail.com> Gerrit-Reviewer: Gerrit Patch Uploader <gerritpatchuploa...@gmail.com> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits