jenkins-bot has submitted this change and it was merged.

Change subject: Bug 48917: Fix extracts
......................................................................


Bug 48917: Fix extracts

* Removes rules from MobileFormatter
(now handled by WMF-specific config)
See: Iffc54344121fea029d22ba17b68b2b03476ce884

Change-Id: If5a88d0a21b736325dc7a0168427efa344308355
---
M MobileFrontend.php
M includes/MobileFrontend.hooks.php
M includes/api/ApiQueryExtracts.php
A includes/formatters/ExtractFormatter.php
M includes/formatters/HtmlFormatter.php
M includes/formatters/MobileFormatter.php
A tests/ExtractFormatterTest.php
M tests/HtmlFormatterTest.php
8 files changed, 159 insertions(+), 86 deletions(-)

Approvals:
  MaxSem: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/MobileFrontend.php b/MobileFrontend.php
index df2757b..7fc64a7 100644
--- a/MobileFrontend.php
+++ b/MobileFrontend.php
@@ -42,6 +42,7 @@
        'WmlContext' => 'WmlContext',
        'WmlDeviceProperties' => 'DeviceDetection',
 
+       'ExtractFormatter' => 'formatters/ExtractFormatter',
        'HtmlFormatter' => 'formatters/HtmlFormatter',
        'MobileFormatter' => 'formatters/MobileFormatter',
        'MobileFormatterHTML' => 'formatters/MobileFormatterHTML',
diff --git a/includes/MobileFrontend.hooks.php 
b/includes/MobileFrontend.hooks.php
index fbaffe8..0f5b147 100644
--- a/includes/MobileFrontend.hooks.php
+++ b/includes/MobileFrontend.hooks.php
@@ -494,11 +494,12 @@
                $dir = dirname( dirname( __FILE__ ) ) . '/tests';
 
                $files[] = "$dir/ApiParseExtenderTest.php";
-               $files[] = "$dir/MobileContextTest.php";
-               $files[] = "$dir/MobileFrontendTest.php";
                $files[] = "$dir/DeviceDetectionTest.php";
+               $files[] = "$dir/ExtractFormatterTest.php";
                $files[] = "$dir/HtmlFormatterTest.php";
+               $files[] = "$dir/MobileContextTest.php";
                $files[] = "$dir/MobileFormatterTest.php";
+               $files[] = "$dir/MobileFrontendTest.php";
                $files[] = "$dir/modules/MFResourceLoaderModuleTest.php";
 
                // special page tests
diff --git a/includes/api/ApiQueryExtracts.php 
b/includes/api/ApiQueryExtracts.php
index bca3ff8..fe04e19 100644
--- a/includes/api/ApiQueryExtracts.php
+++ b/includes/api/ApiQueryExtracts.php
@@ -1,9 +1,6 @@
 <?php
 
 class ApiQueryExtracts extends ApiQueryBase {
-       const SECTION_MARKER_START = "\1\2";
-       const SECTION_MARKER_END = "\2\1";
-
        /**
         * @var ParserOptions
         */
@@ -143,7 +140,7 @@
 
        private function getFirstSection( $text, $plainText ) {
                if ( $plainText ) {
-                       $regexp = '/^(.*?)(?=' . self::SECTION_MARKER_START . 
')/s';
+                       $regexp = '/^(.*?)(?=' . 
ExtractFormatter::SECTION_MARKER_START . ')/s';
                } else {
                        $regexp = '/^(.*?)(?=<h[1-6]\b)/s';
                }
@@ -299,7 +296,7 @@
 
        private function doSections( $text ) {
                $text = preg_replace_callback(
-                       "/" . self::SECTION_MARKER_START . '(\d)'. 
self::SECTION_MARKER_END . "(.*?)$/m",
+                       "/" . ExtractFormatter::SECTION_MARKER_START . '(\d)'. 
ExtractFormatter::SECTION_MARKER_END . "(.*?)$/m",
                        array( $this, 'sectionCallback' ),
                        $text
                );
@@ -344,7 +341,7 @@
                        'intro' => false,
                        'plaintext' => false,
                        'sectionformat' => array(
-                               ApiBase::PARAM_TYPE => 
ExtractFormatter::$sectionFormats,
+                               ApiBase::PARAM_TYPE => array( 'plain', 'wiki', 
'raw' ),
                                ApiBase::PARAM_DFLT => 'wiki',
                        ),
                        'continue' => array(
@@ -393,60 +390,5 @@
 
        public function getVersion() {
                return __CLASS__ . ': $Id$';
-       }
-}
-
-class ExtractFormatter extends HtmlFormatter {
-       private $plainText;
-       private $sectionFormat;
-
-       public static $sectionFormats = array(
-               'plain',
-               'wiki',
-               'raw',
-       );
-
-       public function __construct( $text, $plainText, $sectionFormat ) {
-               wfProfileIn( __METHOD__ );
-               parent::__construct( HtmlFormatter::wrapHTML( $text ) );
-               $this->plainText = $plainText;
-               $this->sectionFormat = $sectionFormat;
-
-               $this->removeImages();
-               // @fixme: use rules from MobileFormatter?
-               $this->remove( array( 'table', 'div', '.editsection', 
'.mw-editsection', 'sup.reference', 'span.coordinates',
-                       'span.geo-multi-punct', 'span.geo-nondefault', 
'.noexcerpt', '.error' )
-               );
-               if ( $plainText ) {
-                       $this->flattenAllTags();
-               } else {
-                       $this->flatten( array( 'span', 'a' ) );
-               }
-               wfProfileOut( __METHOD__ );
-       }
-
-       public function getText( $dummy = null ) {
-               wfProfileIn( __METHOD__ );
-               $this->filterContent();
-               $text = parent::getText();
-               if ( $this->plainText ) {
-                       $text = html_entity_decode( $text );
-                       $text = str_replace( "\r", "\n", $text ); // for Windows
-                       $text = preg_replace( "/\n{3,}/", "\n\n", $text ); // 
normalise newlines
-               }
-               wfProfileOut( __METHOD__ );
-               return $text;
-       }
-
-       public function onHtmlReady( $html ) {
-               wfProfileIn( __METHOD__ );
-               if ( $this->plainText ) {
-                       $html = preg_replace( '/\s*(<h([1-6])\b)/i',
-                               "\n\n" . ApiQueryExtracts::SECTION_MARKER_START 
. '$2' . ApiQueryExtracts::SECTION_MARKER_END . '$1' ,
-                               $html
-                       );
-               }
-               wfProfileOut( __METHOD__ );
-               return $html;
        }
 }
diff --git a/includes/formatters/ExtractFormatter.php 
b/includes/formatters/ExtractFormatter.php
new file mode 100644
index 0000000..7202956
--- /dev/null
+++ b/includes/formatters/ExtractFormatter.php
@@ -0,0 +1,63 @@
+<?php
+
+/**
+ * Provides text-only or limited-HTML extracts of page HTML
+ */
+class ExtractFormatter extends HtmlFormatter {
+       const SECTION_MARKER_START = "\1\2";
+       const SECTION_MARKER_END = "\2\1";
+
+       private $plainText;
+
+       /**
+        * @param string $text: Text to convert
+        * @param bool $plainText: Whether extract should be plaintext
+        */
+       public function __construct( $text, $plainText ) {
+               global $wgMFRemovableClasses;
+
+               wfProfileIn( __METHOD__ );
+               parent::__construct( HtmlFormatter::wrapHTML( $text ) );
+               $this->plainText = $plainText;
+
+               $this->removeImages();
+               $this->useImgAlt( false );
+               // @fixme: use rules from MobileFormatter?
+               $this->remove( array( 'table', 'div', '.editsection', 
'.mw-editsection', 'sup.reference', 'span.coordinates',
+                               'span.geo-multi-punct', 'span.geo-nondefault', 
'.noexcerpt', '.error' )
+               );
+               $this->remove( $wgMFRemovableClasses );
+               if ( $plainText ) {
+                       $this->flattenAllTags();
+               } else {
+                       $this->flatten( array( 'span', 'a' ) );
+               }
+               wfProfileOut( __METHOD__ );
+       }
+
+       public function getText( $dummy = null ) {
+               wfProfileIn( __METHOD__ );
+               $this->filterContent();
+               $text = parent::getText();
+               if ( $this->plainText ) {
+                       $text = html_entity_decode( $text );
+                       $text = str_replace( "\xC2\xA0", ' ', $text ); // 
replace nbsp with space
+                       $text = str_replace( "\r", "\n", $text ); // for Windows
+                       $text = preg_replace( "/\n{3,}/", "\n\n", $text ); // 
normalise newlines
+               }
+               wfProfileOut( __METHOD__ );
+               return $text;
+       }
+
+       public function onHtmlReady( $html ) {
+               wfProfileIn( __METHOD__ );
+               if ( $this->plainText ) {
+                       $html = preg_replace( '/\s*(<h([1-6])\b)/i',
+                               "\n\n" . self::SECTION_MARKER_START . '$2' . 
self::SECTION_MARKER_END . '$1' ,
+                               $html
+                       );
+               }
+               wfProfileOut( __METHOD__ );
+               return $html;
+       }
+}
\ No newline at end of file
diff --git a/includes/formatters/HtmlFormatter.php 
b/includes/formatters/HtmlFormatter.php
index 473e159..d52d7d9 100644
--- a/includes/formatters/HtmlFormatter.php
+++ b/includes/formatters/HtmlFormatter.php
@@ -15,6 +15,7 @@
        private $removeImages = false;
        private $idWhitelist = array();
        private $flattenRedLinks = false;
+       private $imgAlt = true;
 
        /**
         * Constructor
@@ -107,6 +108,13 @@
        }
 
        /**
+        * @param bool $value
+        */
+       public function useImgAlt( $value ) {
+               $this->imgAlt = $value;
+       }
+
+       /**
         * Checks whether specified element should not be removed due to 
whitelist
         * @param DOMElement $element: Element to check
         * @return bool
@@ -149,7 +157,7 @@
                        $tagToRemoveNodes = $doc->getElementsByTagName( 
$tagToRemove );
                        foreach ( $tagToRemoveNodes as $tagToRemoveNode ) {
                                if ( $tagToRemoveNode && 
$this->elementNotWhitelisted( $tagToRemoveNode ) ) {
-                                       if ( $tagToRemoveNode->nodeName == 
'img' ) {
+                                       if ( $this->imgAlt && 
$tagToRemoveNode->nodeName == 'img' ) {
                                                $domElemsToReplace[] = 
$tagToRemoveNode;
                                        } else {
                                                $domElemsToRemove[] = 
$tagToRemoveNode;
@@ -171,30 +179,33 @@
                        $domElement->parentNode->replaceChild( $replacement, 
$domElement );
                }
 
-               foreach ( $domElemsToRemove as $domElement ) {
-                       $domElement->parentNode->removeChild( $domElement );
-               }
+               $this->removeElements( $domElemsToRemove );
 
                // Elements with named IDs
+               $domElemsToRemove = array();
                foreach ( $removals['ID'] as $itemToRemove ) {
                        $itemToRemoveNode = $doc->getElementById( $itemToRemove 
);
                        if ( $itemToRemoveNode ) {
-                               $itemToRemoveNode->parentNode->removeChild( 
$itemToRemoveNode );
+                               $domElemsToRemove[] = $itemToRemoveNode;
                        }
                }
+               $this->removeElements( $domElemsToRemove );
 
                // CSS Classes
+               $domElemsToRemove = array();
                $xpath = new DOMXpath( $doc );
                foreach ( $removals['CLASS'] as $classToRemove ) {
-                       $elements = $xpath->query( '//*[@class="' . 
$classToRemove . '"]' );
+                       $elements = $xpath->query( '//*[contains(@class, "' . 
$classToRemove . '")]' );
 
                        /** @var $element DOMElement */
                        foreach ( $elements as $element ) {
-                               if ( $element->parentNode && 
$this->elementNotWhitelisted( $element ) ) {
-                                       $element->parentNode->removeChild( 
$element );
+                               $classes = $element->getAttribute( 'class' );
+                               if ( preg_match( "/\b$classToRemove\b/", 
$classes ) && $element->parentNode && $this->elementNotWhitelisted( $element ) 
) {
+                                       $domElemsToRemove[] = $element;
                                }
                        }
                }
+               $this->removeElements( $domElemsToRemove );
 
                // Tags with CSS Classes
                foreach ( $removals['TAG_CLASS'] as $classToRemove ) {
@@ -236,6 +247,17 @@
        }
 
        /**
+        * Removes a list of elelments from DOMDocument
+        * @param array $elements
+        */
+       private function removeElements( array $elements ) {
+               /** @var $element DOMElement */
+               foreach ( $elements as $element ) {
+                       $element->parentNode->removeChild( $element );
+               }
+       }
+
+       /**
         * libxml in its usual pointlessness converts many chars to entities - 
this function
         * perfoms a reverse conversion
         * @param string $html
diff --git a/includes/formatters/MobileFormatter.php 
b/includes/formatters/MobileFormatter.php
index 4cf1a3c..ce94d8d 100644
--- a/includes/formatters/MobileFormatter.php
+++ b/includes/formatters/MobileFormatter.php
@@ -17,17 +17,9 @@
 
        private $defaultItemsToRemove = array(
                '.toc',
-               'div.stub',
                '#search', // remove search form element from Special:Search
-               'div.sister-project',
                'div.magnify',
                'span.t',
-               '.portal',
-               '#protected-icon',
-               '.boilerplate',
-               '#id-articulo-destacado', // FA star on eswiki, @todo: remove 
class="metadata topicon" instead
-               '.hiddenStructure',
-               '.medialist',
                '.mw-search-createlink',
                '#ogg_player_1',
                '#ogg_player_2',
diff --git a/tests/ExtractFormatterTest.php b/tests/ExtractFormatterTest.php
new file mode 100644
index 0000000..8dd73cf
--- /dev/null
+++ b/tests/ExtractFormatterTest.php
@@ -0,0 +1,38 @@
+<?php
+
+/**
+ * @group MobileFrontend
+ * @group Broken
+ * Disabled for now due to Jenkins weirdness
+ */
+class MF_ExtractFormatterTest extends MediaWikiTestCase {
+       /**
+        * @dataProvider provideExtracts
+        */
+       public function testExtracts( $expected, $wikiText, $plainText ) {
+               $title = Title::newFromText( 'Test' );
+               $po = new ParserOptions();
+               $po->setEditSection( true );
+               $parser = new Parser();
+               $text = $parser->parse( $wikiText, $title, $po )->getText();
+               $fmt = new ExtractFormatter( $text, $plainText );
+               $fmt->remove( '.metadata' ); // Will be added via 
$wgMFRemovableClasses on WMF
+               $text = trim( $fmt->getText() );
+               $this->assertEquals( $expected, $text );
+       }
+
+       public function provideExtracts() {
+               $dutch = "'''Dutch''' (<span class=\"unicode haudio\" 
style=\"white-space:nowrap;\"><span class=\"fn\">"
+                       . 
"[[File:Loudspeaker.svg|11px|link=File:nl-Nederlands.ogg|About this 
sound]]&nbsp;[[:Media:nl-Nederlands.ogg|''Nederlands'']]"
+                       . "</span>&nbsp;<small class=\"metadata audiolinkinfo\" 
style=\"cursor:help;\">([[Wikipedia:Media help|<span style=\"cursor:help;\">"
+                       . "help</span>]]ยท[[:File:nl-Nederlands.ogg|<span 
style=\"cursor:help;\">info</span>]])</small></span>) is a"
+                       . " [[West Germanic languages|West Germanic language]] 
and the native language of most of the population of the [[Netherlands]]";
+               return array(
+                       array(
+                               "Dutch ( Nederlands ) is a West Germanic 
language and the native language of most of the population of the Netherlands",
+                               $dutch,
+                               true,
+                       ),
+               );
+       }
+}
\ No newline at end of file
diff --git a/tests/HtmlFormatterTest.php b/tests/HtmlFormatterTest.php
index b5fa9db..a9a141b 100644
--- a/tests/HtmlFormatterTest.php
+++ b/tests/HtmlFormatterTest.php
@@ -25,8 +25,12 @@
        }
 
        public function getHtmlData() {
-               $disableImages = function( HtmlFormatter $f ) {
+               $removeImages = function( HtmlFormatter $f ) {
                        $f->removeImages();
+               };
+               $fullyRemoveImages = function( HtmlFormatter $f ) {
+                       $f->removeImages();
+                       $f->useImgAlt( false );
                };
                $removeTags = function( HtmlFormatter $f ) {
                        $f->remove( array( 'table', '.foo', '#bar', 'div.baz' ) 
);
@@ -43,11 +47,21 @@
                        array(
                                '<img src="/foo/bar.jpg">Blah</img>',
                                '<span class="mw-mf-image-replacement">['. 
wfMessage( 'mobile-frontend-missing-image' ) .']</span>Blah',
-                               $disableImages,
+                               $removeImages,
+                       ),
+                       array(
+                               '<img src="/foo/bar.jpg" alt="Blah"/>',
+                               '<span 
class="mw-mf-image-replacement">[Blah]</span>',
+                               $removeImages,
+                       ),
+                       array(
+                               '<img src="/foo/bar.jpg" alt="Blah"/>',
+                               '',
+                               $fullyRemoveImages,
                        ),
                        // basic tag removal
                        array(
-                               '<table><tr><td>foo</td></tr></table><div 
class="foo">foo</div><span id="bar">bar</span>
+                               '<table><tr><td>foo</td></tr></table><div 
class="foo">foo</div><div class="foo quux">foo</div><span id="bar">bar</span>
 <strong class="foo" id="bar">foobar</strong><div class="notfoo">test</div><div 
class="baz"/>
 <span class="baz">baz</span> <span class="foo" id="jedi">jedi</span>',
 
@@ -75,23 +89,23 @@
                        array(
                                '<img alt="picture of kitty" src="kitty.jpg">',
                                '<span class="mw-mf-image-replacement">[picture 
of kitty]</span>',
-                               $disableImages,
+                               $removeImages,
                        ),
                        array(
                                '<img src="kitty.jpg">',
                                '<span class="mw-mf-image-replacement">[' . 
wfMessage( 'mobile-frontend-missing-image' ) . ']</span>',
-                               $disableImages,
+                               $removeImages,
                        ),
                        array(
                                '<img alt src="kitty.jpg">',
                                '<span class="mw-mf-image-replacement">[' . 
wfMessage( 'mobile-frontend-missing-image' ) . ']</span>',
-                               $disableImages,
+                               $removeImages,
                        ),
                        array(
                                '<img alt src="kitty.jpg">look at the cute 
kitty!<img alt="picture of angry dog" src="dog.jpg">',
                                '<span class="mw-mf-image-replacement">[' . 
wfMessage( 'mobile-frontend-missing-image' ) . ']</span>look at the cute 
kitty!'.
                                        '<span 
class="mw-mf-image-replacement">[picture of angry dog]</span>',
-                               $disableImages,
+                               $removeImages,
                        )
                );
        }

-- 
To view, visit https://gerrit.wikimedia.org/r/69585
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: If5a88d0a21b736325dc7a0168427efa344308355
Gerrit-PatchSet: 10
Gerrit-Project: mediawiki/extensions/MobileFrontend
Gerrit-Branch: master
Gerrit-Owner: MaxSem <[email protected]>
Gerrit-Reviewer: Jdlrobson <[email protected]>
Gerrit-Reviewer: Kaldari <[email protected]>
Gerrit-Reviewer: MaxSem <[email protected]>
Gerrit-Reviewer: awjrichards <[email protected]>
Gerrit-Reviewer: jenkins-bot

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to