jenkins-bot has submitted this change and it was merged.
Change subject: Bug 48917: Fix extracts
......................................................................
Bug 48917: Fix extracts
* Removes rules from MobileFormatter
(now handled by WMF-specific config)
See: Iffc54344121fea029d22ba17b68b2b03476ce884
Change-Id: If5a88d0a21b736325dc7a0168427efa344308355
---
M MobileFrontend.php
M includes/MobileFrontend.hooks.php
M includes/api/ApiQueryExtracts.php
A includes/formatters/ExtractFormatter.php
M includes/formatters/HtmlFormatter.php
M includes/formatters/MobileFormatter.php
A tests/ExtractFormatterTest.php
M tests/HtmlFormatterTest.php
8 files changed, 159 insertions(+), 86 deletions(-)
Approvals:
MaxSem: Looks good to me, approved
jenkins-bot: Verified
diff --git a/MobileFrontend.php b/MobileFrontend.php
index df2757b..7fc64a7 100644
--- a/MobileFrontend.php
+++ b/MobileFrontend.php
@@ -42,6 +42,7 @@
'WmlContext' => 'WmlContext',
'WmlDeviceProperties' => 'DeviceDetection',
+ 'ExtractFormatter' => 'formatters/ExtractFormatter',
'HtmlFormatter' => 'formatters/HtmlFormatter',
'MobileFormatter' => 'formatters/MobileFormatter',
'MobileFormatterHTML' => 'formatters/MobileFormatterHTML',
diff --git a/includes/MobileFrontend.hooks.php
b/includes/MobileFrontend.hooks.php
index fbaffe8..0f5b147 100644
--- a/includes/MobileFrontend.hooks.php
+++ b/includes/MobileFrontend.hooks.php
@@ -494,11 +494,12 @@
$dir = dirname( dirname( __FILE__ ) ) . '/tests';
$files[] = "$dir/ApiParseExtenderTest.php";
- $files[] = "$dir/MobileContextTest.php";
- $files[] = "$dir/MobileFrontendTest.php";
$files[] = "$dir/DeviceDetectionTest.php";
+ $files[] = "$dir/ExtractFormatterTest.php";
$files[] = "$dir/HtmlFormatterTest.php";
+ $files[] = "$dir/MobileContextTest.php";
$files[] = "$dir/MobileFormatterTest.php";
+ $files[] = "$dir/MobileFrontendTest.php";
$files[] = "$dir/modules/MFResourceLoaderModuleTest.php";
// special page tests
diff --git a/includes/api/ApiQueryExtracts.php
b/includes/api/ApiQueryExtracts.php
index bca3ff8..fe04e19 100644
--- a/includes/api/ApiQueryExtracts.php
+++ b/includes/api/ApiQueryExtracts.php
@@ -1,9 +1,6 @@
<?php
class ApiQueryExtracts extends ApiQueryBase {
- const SECTION_MARKER_START = "\1\2";
- const SECTION_MARKER_END = "\2\1";
-
/**
* @var ParserOptions
*/
@@ -143,7 +140,7 @@
private function getFirstSection( $text, $plainText ) {
if ( $plainText ) {
- $regexp = '/^(.*?)(?=' . self::SECTION_MARKER_START .
')/s';
+ $regexp = '/^(.*?)(?=' .
ExtractFormatter::SECTION_MARKER_START . ')/s';
} else {
$regexp = '/^(.*?)(?=<h[1-6]\b)/s';
}
@@ -299,7 +296,7 @@
private function doSections( $text ) {
$text = preg_replace_callback(
- "/" . self::SECTION_MARKER_START . '(\d)'.
self::SECTION_MARKER_END . "(.*?)$/m",
+ "/" . ExtractFormatter::SECTION_MARKER_START . '(\d)'.
ExtractFormatter::SECTION_MARKER_END . "(.*?)$/m",
array( $this, 'sectionCallback' ),
$text
);
@@ -344,7 +341,7 @@
'intro' => false,
'plaintext' => false,
'sectionformat' => array(
- ApiBase::PARAM_TYPE =>
ExtractFormatter::$sectionFormats,
+ ApiBase::PARAM_TYPE => array( 'plain', 'wiki',
'raw' ),
ApiBase::PARAM_DFLT => 'wiki',
),
'continue' => array(
@@ -393,60 +390,5 @@
public function getVersion() {
return __CLASS__ . ': $Id$';
- }
-}
-
-class ExtractFormatter extends HtmlFormatter {
- private $plainText;
- private $sectionFormat;
-
- public static $sectionFormats = array(
- 'plain',
- 'wiki',
- 'raw',
- );
-
- public function __construct( $text, $plainText, $sectionFormat ) {
- wfProfileIn( __METHOD__ );
- parent::__construct( HtmlFormatter::wrapHTML( $text ) );
- $this->plainText = $plainText;
- $this->sectionFormat = $sectionFormat;
-
- $this->removeImages();
- // @fixme: use rules from MobileFormatter?
- $this->remove( array( 'table', 'div', '.editsection',
'.mw-editsection', 'sup.reference', 'span.coordinates',
- 'span.geo-multi-punct', 'span.geo-nondefault',
'.noexcerpt', '.error' )
- );
- if ( $plainText ) {
- $this->flattenAllTags();
- } else {
- $this->flatten( array( 'span', 'a' ) );
- }
- wfProfileOut( __METHOD__ );
- }
-
- public function getText( $dummy = null ) {
- wfProfileIn( __METHOD__ );
- $this->filterContent();
- $text = parent::getText();
- if ( $this->plainText ) {
- $text = html_entity_decode( $text );
- $text = str_replace( "\r", "\n", $text ); // for Windows
- $text = preg_replace( "/\n{3,}/", "\n\n", $text ); //
normalise newlines
- }
- wfProfileOut( __METHOD__ );
- return $text;
- }
-
- public function onHtmlReady( $html ) {
- wfProfileIn( __METHOD__ );
- if ( $this->plainText ) {
- $html = preg_replace( '/\s*(<h([1-6])\b)/i',
- "\n\n" . ApiQueryExtracts::SECTION_MARKER_START
. '$2' . ApiQueryExtracts::SECTION_MARKER_END . '$1' ,
- $html
- );
- }
- wfProfileOut( __METHOD__ );
- return $html;
}
}
diff --git a/includes/formatters/ExtractFormatter.php
b/includes/formatters/ExtractFormatter.php
new file mode 100644
index 0000000..7202956
--- /dev/null
+++ b/includes/formatters/ExtractFormatter.php
@@ -0,0 +1,63 @@
+<?php
+
+/**
+ * Provides text-only or limited-HTML extracts of page HTML
+ */
+class ExtractFormatter extends HtmlFormatter {
+ const SECTION_MARKER_START = "\1\2";
+ const SECTION_MARKER_END = "\2\1";
+
+ private $plainText;
+
+ /**
+ * @param string $text: Text to convert
+ * @param bool $plainText: Whether extract should be plaintext
+ */
+ public function __construct( $text, $plainText ) {
+ global $wgMFRemovableClasses;
+
+ wfProfileIn( __METHOD__ );
+ parent::__construct( HtmlFormatter::wrapHTML( $text ) );
+ $this->plainText = $plainText;
+
+ $this->removeImages();
+ $this->useImgAlt( false );
+ // @fixme: use rules from MobileFormatter?
+ $this->remove( array( 'table', 'div', '.editsection',
'.mw-editsection', 'sup.reference', 'span.coordinates',
+ 'span.geo-multi-punct', 'span.geo-nondefault',
'.noexcerpt', '.error' )
+ );
+ $this->remove( $wgMFRemovableClasses );
+ if ( $plainText ) {
+ $this->flattenAllTags();
+ } else {
+ $this->flatten( array( 'span', 'a' ) );
+ }
+ wfProfileOut( __METHOD__ );
+ }
+
+ public function getText( $dummy = null ) {
+ wfProfileIn( __METHOD__ );
+ $this->filterContent();
+ $text = parent::getText();
+ if ( $this->plainText ) {
+ $text = html_entity_decode( $text );
+ $text = str_replace( "\xC2\xA0", ' ', $text ); //
replace nbsp with space
+ $text = str_replace( "\r", "\n", $text ); // for Windows
+ $text = preg_replace( "/\n{3,}/", "\n\n", $text ); //
normalise newlines
+ }
+ wfProfileOut( __METHOD__ );
+ return $text;
+ }
+
+ public function onHtmlReady( $html ) {
+ wfProfileIn( __METHOD__ );
+ if ( $this->plainText ) {
+ $html = preg_replace( '/\s*(<h([1-6])\b)/i',
+ "\n\n" . self::SECTION_MARKER_START . '$2' .
self::SECTION_MARKER_END . '$1' ,
+ $html
+ );
+ }
+ wfProfileOut( __METHOD__ );
+ return $html;
+ }
+}
\ No newline at end of file
diff --git a/includes/formatters/HtmlFormatter.php
b/includes/formatters/HtmlFormatter.php
index 473e159..d52d7d9 100644
--- a/includes/formatters/HtmlFormatter.php
+++ b/includes/formatters/HtmlFormatter.php
@@ -15,6 +15,7 @@
private $removeImages = false;
private $idWhitelist = array();
private $flattenRedLinks = false;
+ private $imgAlt = true;
/**
* Constructor
@@ -107,6 +108,13 @@
}
/**
+ * @param bool $value
+ */
+ public function useImgAlt( $value ) {
+ $this->imgAlt = $value;
+ }
+
+ /**
* Checks whether specified element should not be removed due to
whitelist
* @param DOMElement $element: Element to check
* @return bool
@@ -149,7 +157,7 @@
$tagToRemoveNodes = $doc->getElementsByTagName(
$tagToRemove );
foreach ( $tagToRemoveNodes as $tagToRemoveNode ) {
if ( $tagToRemoveNode &&
$this->elementNotWhitelisted( $tagToRemoveNode ) ) {
- if ( $tagToRemoveNode->nodeName ==
'img' ) {
+ if ( $this->imgAlt &&
$tagToRemoveNode->nodeName == 'img' ) {
$domElemsToReplace[] =
$tagToRemoveNode;
} else {
$domElemsToRemove[] =
$tagToRemoveNode;
@@ -171,30 +179,33 @@
$domElement->parentNode->replaceChild( $replacement,
$domElement );
}
- foreach ( $domElemsToRemove as $domElement ) {
- $domElement->parentNode->removeChild( $domElement );
- }
+ $this->removeElements( $domElemsToRemove );
// Elements with named IDs
+ $domElemsToRemove = array();
foreach ( $removals['ID'] as $itemToRemove ) {
$itemToRemoveNode = $doc->getElementById( $itemToRemove
);
if ( $itemToRemoveNode ) {
- $itemToRemoveNode->parentNode->removeChild(
$itemToRemoveNode );
+ $domElemsToRemove[] = $itemToRemoveNode;
}
}
+ $this->removeElements( $domElemsToRemove );
// CSS Classes
+ $domElemsToRemove = array();
$xpath = new DOMXpath( $doc );
foreach ( $removals['CLASS'] as $classToRemove ) {
- $elements = $xpath->query( '//*[@class="' .
$classToRemove . '"]' );
+ $elements = $xpath->query( '//*[contains(@class, "' .
$classToRemove . '")]' );
/** @var $element DOMElement */
foreach ( $elements as $element ) {
- if ( $element->parentNode &&
$this->elementNotWhitelisted( $element ) ) {
- $element->parentNode->removeChild(
$element );
+ $classes = $element->getAttribute( 'class' );
+ if ( preg_match( "/\b$classToRemove\b/",
$classes ) && $element->parentNode && $this->elementNotWhitelisted( $element )
) {
+ $domElemsToRemove[] = $element;
}
}
}
+ $this->removeElements( $domElemsToRemove );
// Tags with CSS Classes
foreach ( $removals['TAG_CLASS'] as $classToRemove ) {
@@ -236,6 +247,17 @@
}
/**
+ * Removes a list of elelments from DOMDocument
+ * @param array $elements
+ */
+ private function removeElements( array $elements ) {
+ /** @var $element DOMElement */
+ foreach ( $elements as $element ) {
+ $element->parentNode->removeChild( $element );
+ }
+ }
+
+ /**
* libxml in its usual pointlessness converts many chars to entities -
this function
* perfoms a reverse conversion
* @param string $html
diff --git a/includes/formatters/MobileFormatter.php
b/includes/formatters/MobileFormatter.php
index 4cf1a3c..ce94d8d 100644
--- a/includes/formatters/MobileFormatter.php
+++ b/includes/formatters/MobileFormatter.php
@@ -17,17 +17,9 @@
private $defaultItemsToRemove = array(
'.toc',
- 'div.stub',
'#search', // remove search form element from Special:Search
- 'div.sister-project',
'div.magnify',
'span.t',
- '.portal',
- '#protected-icon',
- '.boilerplate',
- '#id-articulo-destacado', // FA star on eswiki, @todo: remove
class="metadata topicon" instead
- '.hiddenStructure',
- '.medialist',
'.mw-search-createlink',
'#ogg_player_1',
'#ogg_player_2',
diff --git a/tests/ExtractFormatterTest.php b/tests/ExtractFormatterTest.php
new file mode 100644
index 0000000..8dd73cf
--- /dev/null
+++ b/tests/ExtractFormatterTest.php
@@ -0,0 +1,38 @@
+<?php
+
+/**
+ * @group MobileFrontend
+ * @group Broken
+ * Disabled for now due to Jenkins weirdness
+ */
+class MF_ExtractFormatterTest extends MediaWikiTestCase {
+ /**
+ * @dataProvider provideExtracts
+ */
+ public function testExtracts( $expected, $wikiText, $plainText ) {
+ $title = Title::newFromText( 'Test' );
+ $po = new ParserOptions();
+ $po->setEditSection( true );
+ $parser = new Parser();
+ $text = $parser->parse( $wikiText, $title, $po )->getText();
+ $fmt = new ExtractFormatter( $text, $plainText );
+ $fmt->remove( '.metadata' ); // Will be added via
$wgMFRemovableClasses on WMF
+ $text = trim( $fmt->getText() );
+ $this->assertEquals( $expected, $text );
+ }
+
+ public function provideExtracts() {
+ $dutch = "'''Dutch''' (<span class=\"unicode haudio\"
style=\"white-space:nowrap;\"><span class=\"fn\">"
+ .
"[[File:Loudspeaker.svg|11px|link=File:nl-Nederlands.ogg|About this
sound]] [[:Media:nl-Nederlands.ogg|''Nederlands'']]"
+ . "</span> <small class=\"metadata audiolinkinfo\"
style=\"cursor:help;\">([[Wikipedia:Media help|<span style=\"cursor:help;\">"
+ . "help</span>]]ยท[[:File:nl-Nederlands.ogg|<span
style=\"cursor:help;\">info</span>]])</small></span>) is a"
+ . " [[West Germanic languages|West Germanic language]]
and the native language of most of the population of the [[Netherlands]]";
+ return array(
+ array(
+ "Dutch ( Nederlands ) is a West Germanic
language and the native language of most of the population of the Netherlands",
+ $dutch,
+ true,
+ ),
+ );
+ }
+}
\ No newline at end of file
diff --git a/tests/HtmlFormatterTest.php b/tests/HtmlFormatterTest.php
index b5fa9db..a9a141b 100644
--- a/tests/HtmlFormatterTest.php
+++ b/tests/HtmlFormatterTest.php
@@ -25,8 +25,12 @@
}
public function getHtmlData() {
- $disableImages = function( HtmlFormatter $f ) {
+ $removeImages = function( HtmlFormatter $f ) {
$f->removeImages();
+ };
+ $fullyRemoveImages = function( HtmlFormatter $f ) {
+ $f->removeImages();
+ $f->useImgAlt( false );
};
$removeTags = function( HtmlFormatter $f ) {
$f->remove( array( 'table', '.foo', '#bar', 'div.baz' )
);
@@ -43,11 +47,21 @@
array(
'<img src="/foo/bar.jpg">Blah</img>',
'<span class="mw-mf-image-replacement">['.
wfMessage( 'mobile-frontend-missing-image' ) .']</span>Blah',
- $disableImages,
+ $removeImages,
+ ),
+ array(
+ '<img src="/foo/bar.jpg" alt="Blah"/>',
+ '<span
class="mw-mf-image-replacement">[Blah]</span>',
+ $removeImages,
+ ),
+ array(
+ '<img src="/foo/bar.jpg" alt="Blah"/>',
+ '',
+ $fullyRemoveImages,
),
// basic tag removal
array(
- '<table><tr><td>foo</td></tr></table><div
class="foo">foo</div><span id="bar">bar</span>
+ '<table><tr><td>foo</td></tr></table><div
class="foo">foo</div><div class="foo quux">foo</div><span id="bar">bar</span>
<strong class="foo" id="bar">foobar</strong><div class="notfoo">test</div><div
class="baz"/>
<span class="baz">baz</span> <span class="foo" id="jedi">jedi</span>',
@@ -75,23 +89,23 @@
array(
'<img alt="picture of kitty" src="kitty.jpg">',
'<span class="mw-mf-image-replacement">[picture
of kitty]</span>',
- $disableImages,
+ $removeImages,
),
array(
'<img src="kitty.jpg">',
'<span class="mw-mf-image-replacement">[' .
wfMessage( 'mobile-frontend-missing-image' ) . ']</span>',
- $disableImages,
+ $removeImages,
),
array(
'<img alt src="kitty.jpg">',
'<span class="mw-mf-image-replacement">[' .
wfMessage( 'mobile-frontend-missing-image' ) . ']</span>',
- $disableImages,
+ $removeImages,
),
array(
'<img alt src="kitty.jpg">look at the cute
kitty!<img alt="picture of angry dog" src="dog.jpg">',
'<span class="mw-mf-image-replacement">[' .
wfMessage( 'mobile-frontend-missing-image' ) . ']</span>look at the cute
kitty!'.
'<span
class="mw-mf-image-replacement">[picture of angry dog]</span>',
- $disableImages,
+ $removeImages,
)
);
}
--
To view, visit https://gerrit.wikimedia.org/r/69585
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: If5a88d0a21b736325dc7a0168427efa344308355
Gerrit-PatchSet: 10
Gerrit-Project: mediawiki/extensions/MobileFrontend
Gerrit-Branch: master
Gerrit-Owner: MaxSem <[email protected]>
Gerrit-Reviewer: Jdlrobson <[email protected]>
Gerrit-Reviewer: Kaldari <[email protected]>
Gerrit-Reviewer: MaxSem <[email protected]>
Gerrit-Reviewer: awjrichards <[email protected]>
Gerrit-Reviewer: jenkins-bot
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits