Gerrit Patch Uploader has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/344791 )

Change subject: Remove broken plaintext section markers
......................................................................

Remove broken plaintext section markers

The heading tags from getExtract are useful even in plaintext output
mode. These tags, scrambled to a special format by ExtractFormatter,
are used by doSection(), which looks for such markers and reformats
them as specified by sectionformat.

When truncated, incomplete markers will become U+FFFD characters in the
API output (see bug below). This patch:

* Adds a public method ExtractFormatter::tidySectionMarkers( $text ) to
  search for and eliminate broken markers at the end.
* Adds a private helper method to construct a regex for the above
  method.
* Modifies ApiQueryExtracts::tidy( $text ) to call the first method for
  plaintext.

Bug: T92628
Change-Id: Ib91fab62601087c891fba0e7fadea7d249c7f18b

Change-Id: I010a79b8dbfe35facc2450b7d148aa7a85716019
---
M includes/ApiQueryExtracts.php
M includes/ExtractFormatter.php
M tests/phpunit/ExtractFormatterTest.php
3 files changed, 91 insertions(+), 2 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/TextExtracts 
refs/changes/91/344791/1

diff --git a/includes/ApiQueryExtracts.php b/includes/ApiQueryExtracts.php
index f80b90b..af24aa1 100644
--- a/includes/ApiQueryExtracts.php
+++ b/includes/ApiQueryExtracts.php
@@ -318,12 +318,14 @@
        }
 
        /**
-        * A simple wrapper around tidy
+        * Cleans up the text. Wraps tidy for HTML, trims out broken section 
markers for plaintext
         * @param string $text
         * @return string
         */
        private function tidy( $text ) {
-               if ( $this->getConfig()->get( 'UseTidy' ) && 
!$this->params['plaintext'] ) {
+               if ( $this->params['plaintext'] ) {
+                       $text = ExtractFormatter::tidySectionMarkers( $text );
+               } elseif ( $this->getConfig()->get( 'UseTidy' ) ) {
                        $text = trim( MWTidy::tidy( $text ) );
                }
                return $text;
diff --git a/includes/ExtractFormatter.php b/includes/ExtractFormatter.php
index 4dd7e92..877cbfd 100644
--- a/includes/ExtractFormatter.php
+++ b/includes/ExtractFormatter.php
@@ -151,4 +151,49 @@
 
                return $removed;
        }
+
+       /**
+        * Removes a trailing broken section marker (it would become U+FFFD in the API).
+        *
+        * @param string $text
+        * @return string
+        */
+       public static function tidySectionMarkers( $text ) {
+               static $pattern = false;
+               if ( $pattern === false ) {
+                       // The marker, atom by atom. The END part is listed as well: a
+                       // complete marker not followed by text is useless, trim it too
+                       // (Still no guarantee for partial heading text)
+                       $head = str_split( self::SECTION_MARKER_START );
+                       $tail = str_split( self::SECTION_MARKER_END );
+                       $atoms = array_merge( $head, [ '\d' ], $tail );
+                       // Truncation happens at the very end of the text, so the
+                       // pattern is anchored there (D: "$" only matches the true end)
+                       // and it also consumes the two newlines before the marker
+                       $inner = self::makeTruncatedRegex( $atoms );
+                       $pattern = '/\n\n' . $inner . "$/D";
+               }
+
+               return preg_replace( $pattern, "", $text );
+       }
+
+       /**
+        * Builds a regexp fragment matching any leading run of the given atoms.
+        *
+        * @param string[] $atoms Regex fragments in order, e.g. [ 'a', 'b' ]
+        * @return string Nested optional groups, e.g. "(?:a(?:b)?)?"
+        */
+       private static function makeTruncatedRegex( $atoms ) {
+               $open = "(?:";  // open a non-capture group
+               $close = ")?";  // end an optional group
+
+               // Nest the groups so that atom N can only match after atom N-1
+               $regexp = '';
+               foreach ( $atoms as $atom ) {
+                       $regexp .= $open . $atom;
+               }
+               $regexp .= str_repeat( $close, count( $atoms ) );
+
+               return $regexp;
+       }
 }
diff --git a/tests/phpunit/ExtractFormatterTest.php 
b/tests/phpunit/ExtractFormatterTest.php
index 064871d..5c869bc 100644
--- a/tests/phpunit/ExtractFormatterTest.php
+++ b/tests/phpunit/ExtractFormatterTest.php
@@ -181,4 +181,46 @@
                        [ $longText, 65536, $longTextExpected ],
                ];
        }
+
+       /**
+        * @dataProvider provideTidySectionMarkers
+        * @param string $text Plaintext that may end in a truncated marker
+        * @param string $expected Text expected back from tidySectionMarkers()
+        */
+       public function testTidySectionMarkers( $text, $expected ) {
+               $this->assertSame( $expected, ExtractFormatter::tidySectionMarkers( $text ) );
+       }
+
+       public function provideTidySectionMarkers() {
+               $start = ExtractFormatter::SECTION_MARKER_START;
+               // A complete marker with no heading text after it counts as broken too
+               $end = ExtractFormatter::SECTION_MARKER_END;
+               $broken = $start . '2' . $end;
+               $broken_length = strlen( $broken );
+
+               $expected = 'Lorem Ipsum Baa Baa Sheep';
+               $expected .= "\n\n" . $start . '2' . $end . "Section 1\n";
+               $expected .= 'This section is meaningless. The next one is broken.';
+               $prefix = $expected . "\n\n";  // Always appears before sections
+
+               // Every slice of the marker goes, along with the newlines before it
+               $cases = [];
+               for ( $len = 0; $len <= $broken_length; $len++ ) {
+                       $cases[] = [
+                               $prefix . substr( $broken, 0, $len ),
+                               $expected
+                       ];
+               }
+
+               // It should not cut away anything that follows
+               $cases[] = [
+                       $prefix . $broken . "N",
+                       $prefix . $broken . "N"
+               ];
+               $cases[] = [
+                       $prefix . $broken . "Not broken at all\n",
+                       $prefix . $broken . "Not broken at all\n"
+               ];
+               return $cases;
+       }
 }

-- 
To view, visit https://gerrit.wikimedia.org/r/344791
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I010a79b8dbfe35facc2450b7d148aa7a85716019
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/TextExtracts
Gerrit-Branch: master
Gerrit-Owner: Gerrit Patch Uploader <gerritpatchuploa...@gmail.com>
Gerrit-Reviewer: Artoria2e5 <arthur200...@gmail.com>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to