Gergő Tisza has uploaded a new change for review. (
https://gerrit.wikimedia.org/r/361453 )
Change subject: [WIP] Concatenate pages and send to Electron
......................................................................
[WIP] Concatenate pages and send to Electron
Change-Id: I686736e0b2c9f98a7c37046336219a0f852179b0
---
M Collection.alias.php
M Collection.hooks.php
M Collection.php
A SpecialRenderBook.php
A includes/BookRenderer.php
A includes/DataProvider.php
A includes/RemexCollectionMunger.php
A modules/offline.css
8 files changed, 817 insertions(+), 0 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/Collection
refs/changes/53/361453/1
diff --git a/Collection.alias.php b/Collection.alias.php
index 770ffc6..28c961e 100644
--- a/Collection.alias.php
+++ b/Collection.alias.php
@@ -11,6 +11,7 @@
/** English (English) */
$specialPageAliases['en'] = [
'Book' => [ 'Book', 'Collection' ],
+ 'RenderBook' => [ 'RenderBook' ],
];
/** Afrikaans (Afrikaans) */
diff --git a/Collection.hooks.php b/Collection.hooks.php
index 797e87f..65103dd 100644
--- a/Collection.hooks.php
+++ b/Collection.hooks.php
@@ -31,6 +31,15 @@
}
/**
+ * Force the invisible skin on raw book views.
+ * @param RequestContext $context
+ * @param Skin $skin
+ */
+ public static function onRequestContextCreateSkin( RequestContext
$context, Skin &$skin ) {
+
+ }
+
+ /**
* Callback for hook SkinBuildSidebar
*
* @param $skin Skin
diff --git a/Collection.php b/Collection.php
index 53cea55..fc989a6 100644
--- a/Collection.php
+++ b/Collection.php
@@ -179,6 +179,14 @@
$wgAutoloadClasses['CollectionSuggest'] = __DIR__ . '/Collection.suggest.php';
$wgAutoloadClasses['CollectionProposals'] = __DIR__ .
'/Collection.suggest.php';
+$wgAutoloadClasses['SpecialRenderBook'] = __DIR__ . '/SpecialRenderBook.php';
+$wgAutoloadClasses[\MediaWiki\Extensions\Collection\DataProvider::class]
+ = __DIR__ . '/includes/DataProvider.php';
+$wgAutoloadClasses[\MediaWiki\Extensions\Collection\BookRenderer::class]
+ = __DIR__ . '/includes/BookRenderer.php';
+$wgAutoloadClasses[MediaWiki\Extensions\Collection\RemexCollectionMunger::class]
+ = __DIR__ . '/includes/RemexCollectionMunger.php';
+
$wgAutoloadClasses['CollectionPageTemplate'] = __DIR__ .
'/templates/CollectionPageTemplate.php';
$wgAutoloadClasses['CollectionListTemplate'] = __DIR__ .
'/templates/CollectionListTemplate.php';
$wgAutoloadClasses['CollectionLoadOverwriteTemplate'] =
@@ -204,6 +212,7 @@
$wgExtensionMessagesFiles['CollectionAlias'] = __DIR__ .
'/Collection.alias.php';
$wgSpecialPages['Book'] = 'SpecialCollection';
+$wgSpecialPages['RenderBook'] = 'SpecialRenderBook';
$wgHooks['SkinTemplateBuildNavUrlsNav_urlsAfterPermalink'][] =
'CollectionHooks::buildNavUrls';
$wgHooks['SidebarBeforeOutput'][] = 'CollectionHooks::buildSidebar';
@@ -248,6 +257,9 @@
'scripts' => 'suggest.js',
'dependencies' => 'ext.collection.bookcreator'
],
+ 'ext.collection.offline' => $collResourceTemplate + [
+ 'styles' => 'offline.css',
+ ],
];
# register global Ajax functions:
diff --git a/SpecialRenderBook.php b/SpecialRenderBook.php
new file mode 100644
index 0000000..f45fa9c
--- /dev/null
+++ b/SpecialRenderBook.php
@@ -0,0 +1,139 @@
+<?php
+
+use MediaWiki\Extensions\Collection\BookRenderer;
+use MediaWiki\Extensions\Collection\DataProvider;
+use MediaWiki\Logger\LoggerFactory;
+use MediaWiki\MediaWikiServices;
+
+/**
+ * Debugging helper: dumps $x using Symfony's VarDumper.
+ * NOTE(review): this looks like leftover WIP debug code; it declares a global function in
+ * the same file as the special page class and presumably requires symfony/var-dumper to be
+ * installed - confirm before merging.
+ * @param mixed $x Value to dump.
+ */
+function d($x) {
+ \Symfony\Component\VarDumper\VarDumper::dump( $x );
+}
+
+/**
+ * Special page to display a book as a single HTML page.
+ */
+class SpecialRenderBook extends SpecialPage {
+
+ /** @var WANObjectCache */
+ private $htmlCache;
+
+ /** @var VirtualRESTServiceClient */
+ private $restClient;
+
+ public function __construct() {
+ parent::__construct( 'RenderBook' );
+
+ $services = MediaWikiServices::getInstance();
+ $this->htmlCache = $services->getMainWANObjectCache();
+ // $this->restClient = $services->getVirtualRESTServiceClient();
+ // FIXME hack RESTBase not working in local vagrant
+ }
+
+ public function execute( $subPage ) {
+ $out = $this->getOutput();
+
+ switch ( $subPage ) {
+ case 'raw':
+ $context = new DerivativeContext(
$this->getContext() );
+ $context->setSkin( new SkinApi() );
+ $out->setContext( $context );
+ // fall through
+ case 'skinned':
+ $book = $this->getBook();
+ $out->addModuleStyles( [
'ext.collection.offline' ] );
+ $out->addModules( $book['modules'] );
+ $out->addModuleScripts( $book['modulescripts']
);
+ $out->addModuleStyles( $book['modulestyles'] );
+ $out->addJsConfigVars( $book['jsconfigvars'] );
+ $out->addHTML( $book['html'] );
+ return;
+
+ case 'electron':
+ // FIXME should just redirect to /raw instead
and make that cacheable but
+ // need non-session-based storage for that
+ $book = $this->getBook();
+
+ $bookUrl = $this->getPageTitle( 'skinned'
)->getFullURL( [ 'key' => $book['key'] ] );
+ // hack hack
+ $electronUrl =
'http://electron.local.wmftest.net:10241/pdf';
+ $accessKey = 'secret';
+
+ $url = $electronUrl . '?' . wfArrayToCgi( [
+ 'accessKey' => $accessKey,
+ 'url' => $bookUrl,
+ ] );
+ $out->redirect( $url );
+ return;
+
+ default:
+ $options = [
+ 'raw' => 'HTML, raw',
+ 'skinned' => 'HTML, as wiki page',
+ 'electron' => 'PDF, raw',
+ ];
+ $html = Html::openElement( 'ul', [] );
+ foreach ( $options as $mode => $description ) {
+ $linkUrl = $this->getPageTitle( $mode
)->getFullURL();
+ $link = Html::element( 'a', [ 'href' =>
$linkUrl ], $description );
+ $html .= Html::rawElement( 'li', [],
$link );
+ }
+ $html .= Html::closeElement( 'ul' );
+ $out->addHTML( $html );
+ return;
+ }
+ }
+
+	/**
+	 * Get the rendered book, from cache or by rendering it.
+	 *
+	 * The whole result (not only the HTML) is cached, so that a cache hit still carries the
+	 * ResourceLoader data which execute() hands over to the OutputPage.
+	 * NOTE(review): the cache key is taken straight from the 'key' request parameter -
+	 * confirm that letting the client choose which WAN cache entry is read is acceptable.
+	 * @return array associative array with:
+	 *   - 'key': WAN cache key for the book
+	 *   - 'html': HTML, not including <body> or anything outside that.
+	 *   - 'modules', 'modulescripts', 'modulestyles', 'jsconfigvars': ResourceLoader data
+	 *     collected by DataProvider::fetchMetadata() (empty arrays when not available).
+	 * @throws ErrorPageError When the book has to be rendered but there is no usable collection.
+	 */
+	private function getBook() {
+		$key = $this->getRequest()->getText( 'key' );
+		if ( $key ) {
+			$cached = $this->htmlCache->get( $key );
+			if ( is_string( $cached ) && $cached ) {
+				// Entry which holds nothing but the HTML.
+				$cached = [ 'html' => $cached ];
+			}
+			if ( is_array( $cached ) && !empty( $cached['html'] ) ) {
+				// Array union keeps the left-hand value, so the defaults below only fill in
+				// fields the cached entry lacks; execute() reads all of them unconditionally.
+				return [ 'key' => $key ] + $cached + [
+					'modules' => [],
+					'modulescripts' => [],
+					'modulestyles' => [],
+					'jsconfigvars' => [],
+				];
+			}
+		}
+
+		$dataProvider = new DataProvider();
+		$bookRenderer = new BookRenderer();
+
+		$collection = $this->getCollection();
+		$pages = $dataProvider->fetchPages( $collection );
+		$metadata = $dataProvider->fetchMetadata( array_keys( $pages ) );
+		$html = $bookRenderer->renderBook( $collection, $pages, $metadata );
+
+		$book = [
+			'html' => $html,
+			'modules' => $metadata['modules'],
+			'modulescripts' => $metadata['modulescripts'],
+			'modulestyles' => $metadata['modulestyles'],
+			'jsconfigvars' => $metadata['jsconfigvars'],
+		];
+
+		// TODO generate the key from the collection metadata instead
+		$key = $this->htmlCache->makeGlobalKey( md5( $html ) );
+		$this->htmlCache->set( $key, $book, 300 );
+
+		return [ 'key' => $key ] + $book;
+	}
+
+ /**
+ * Returns the current collection.
+ * @return array[] Collection, as returned by
CollectionSession::getCollection().
+ * @throws ErrorPageError When there is no active collection.
+ */
+ private function getCollection() {
+ if ( !CollectionSession::hasSession() ) {
+ CollectionSession::startSession();
+ }
+ $collection = CollectionSession::getCollection();
+ if ( !$collection || !$collection['enabled'] ||
!$collection['items'] ) {
+ throw new ErrorPageError( 'collection-error-title',
'collection-error-no-session' );
+ }
+ return $collection;
+ }
+
+}
diff --git a/includes/BookRenderer.php b/includes/BookRenderer.php
new file mode 100644
index 0000000..adf3ce4
--- /dev/null
+++ b/includes/BookRenderer.php
@@ -0,0 +1,177 @@
+<?php
+
+namespace MediaWiki\Extensions\Collection;
+
+use Html;
+use LogicException;
+use Sanitizer;
+use Title;
+
+/**
+ * Renders HTML view of a book by concatenating and transforming HTML and
generating some
+ * leading/trailing pages.
+ */
+class BookRenderer {
+
+	/**
+	 * Generate the concatenated page.
+	 *
+	 * Chapter items become an <h1>, article items an <h2> followed by the article body, which
+	 * is passed through RemexCollectionMunger and wrapped in <article>. The cover page and
+	 * table of contents are prepended and the contributor list is appended at the end.
+	 * @param array[] $collection Collection, as returned by CollectionSession::getCollection().
+	 * @param string[] $pages Map of prefixed DB key => Parsoid HTML.
+	 * @param array[] $metadata Map of prefixed DB key => metadata, as returned by fetchMetadata().
+	 * @return string HTML of the rendered book (without body/head).
+	 */
+	public function renderBook( $collection, $pages, $metadata ) {
+		$hasChapters = (bool)array_filter( $collection['items'], function ( $item ) {
+			return $item['type'] === 'chapter';
+		} );
+
+		$final = '';
+
+		// First we need to render the articles as we can't know the TOC anchors for sure
+		// until we have resolved id conflicts.
+		// FastFormatter chokes on Parsoid HTML. HtmlFormatter is still plenty fast anyway.
+		$formatter = new \RemexHtml\Serializer\HtmlFormatter();
+		$serializer = new \RemexHtml\Serializer\Serializer( $formatter );
+		$munger = new RemexCollectionMunger( $serializer, [
+			'topHeadingLevel' => $hasChapters ? 3 : 2,
+		] );
+		// RemexCollectionMunger has no startCollection()/endCollection() methods; its
+		// per-document state is reset by startCollectionSection() below.
+		foreach ( $collection['items'] as $item ) {
+			if ( $item['type'] === 'chapter' ) {
+				// The chapter title is plain text entered by the user, so it has to be
+				// escaped (Html::element), unlike the display title below which is HTML.
+				$final .= Html::element( 'h1', [
+					'id' => 'mw-book-chapter-' . Sanitizer::escapeId( $item['title'] ),
+					'class' => 'mw-book-chapter',
+				], $item['title'] ) . "\n";
+			} elseif ( $item['type'] === 'article' ) {
+				$title = Title::newFromText( $item['title'] );
+				$dbkey = $title->getPrefixedDBkey();
+				$html = $this->getBodyContents( $pages[$dbkey] );
+
+				$final .= Html::rawElement( 'h2', [
+					'id' => 'mw-book-article-' . $dbkey,
+					'class' => 'mw-book-article',
+				], $metadata['displaytitle'][$dbkey] ) . "\n";
+
+				// The section list is passed by reference: the munger updates ids and levels
+				// in $metadata so that the table of contents matches the munged HTML.
+				$munger->startCollectionSection( './' . $dbkey, $metadata['sections'][$dbkey] );
+				$treeBuilder = new \RemexHtml\TreeBuilder\TreeBuilder( $munger, [] );
+				$dispatcher = new \RemexHtml\TreeBuilder\Dispatcher( $treeBuilder );
+				$tokenizer = new \RemexHtml\Tokenizer\Tokenizer( $dispatcher, $html, [
+					// HTML comes from Parsoid so we can skip validation
+					'ignoreErrors' => true,
+					'ignoreCharRefs' => true,
+					'ignoreNulls' => true,
+					'skipPreprocess' => true,
+				] );
+				$tokenizer->execute( [
+					'fragmentNamespace' => \RemexHtml\HTMLData::NS_HTML,
+					'fragmentName' => 'body',
+				] );
+				$final .= Html::openElement( 'article' )
+					. substr( $serializer->getResult(), 15 ) // strip "<!DOCTYPE html>"
+					. Html::closeElement( 'article' );
+			}
+		}
+
+		$final = $this->renderCoverAndToc( $collection, $metadata )
+			. $final
+			. $this->renderContributors( $metadata );
+		return $final;
+	}
+
+ /**
+ * Generate HTML for book cover page and table of contents.
+ * @param array $collection Collection, as returned by
CollectionSession::getCollection().
+ * @param array[] $metadata Map of prefixed DB key => metadata, as
returned by fetchMetadata().
+ * @return string HTML to prepend to the book.
+ */
+ private function renderCoverAndToc( $collection, $metadata ) {
+ $cover = '';
+ $toc = '';
+
+ $title = $collection['title'];
+ $subtitle = $collection['subtitle'];
+ if ( $title ) {
+ $cover = Html::element( 'h1', [], $title ) . "\n";
+ if ( $subtitle ) {
+ $cover .= Html::element( 'h2', [], $subtitle )
. "\n";
+ }
+ $cover = Html::rawElement( 'header', [], $cover );
+ }
+
+ $outline = [];
+ foreach ( $collection['items'] as $item ) {
+ if ( $item['type'] === 'chapter' ) {
+ $outline[] = [
+ 'text' => $item['title'],
+ 'type' => 'chapter',
+ 'anchor' => 'mw-book-chapter-' .
Sanitizer::escapeId( $item['title'] ),
+ ];
+ } elseif( $item['type'] === 'article' ) {
+ $title = Title::newFromText( $item['title'] );
+ $dbkey = $title->getPrefixedDBkey();
+ $outline[] = [
+ 'text' =>
$metadata['displaytitle'][$dbkey],
+ 'type' => 'article',
+ 'anchor' => 'mw-book-article-' . $dbkey,
+ ];
+ foreach ( $metadata['sections'][$dbkey] as
$section ) {
+ $outline[] = [
+ 'text' => $section['title'],
+ 'type' => 'section',
+ 'level' => $section['level'],
+ 'anchor' => $section['id'],
+ ];
+ }
+ } else {
+ throw new LogicException( 'Unknown collection
item type: ' . $item['type'] );
+ }
+ }
+ $outline[] = [
+ 'text' => 'Contributors',
+ 'type' => 'contributors',
+ 'anchor' => 'mw-book-contributors',
+ ];
+
+ $tocItems = array_map( function ( $item ) {
+ $class = [ 'mw-book-tocitem-type-' . $item['type'] ];
+ if ( $item['type'] === 'section' ) {
+ $class[] = 'level-' . $item['level'];
+ }
+ return Html::rawElement( 'li', [ 'class' => $class ],
+ Html::element( 'a', [ 'href' => '#' .
$item['anchor'] ], $item['text'] ) );
+ }, $outline );
+ $toc = Html::rawElement( 'nav', [ 'class' => 'toc' ],
+ Html::element( 'h1', [], 'Table of Contents' ) . "\n"
+ . Html::rawElement( 'ul', [], implode( "\n", $tocItems
) ) );
+
+ return $cover . $toc;
+ }
+
+ /**
+ * Generate HTML for the list of contributors.
+ * @param array[] $metadata Map of prefixed DB key => metadata, as
returned by fetchMetadata().
+ * @return string HTML to append to the book.
+ */
+ private function renderContributors( $metadata ) {
+ $list = array_map( function ( $name ) {
+ return Html::element( 'li', [], $name );
+ }, array_keys( $metadata['contributors'] ) );
+
+ return Html::element( 'h1', [ 'id' => 'mw-book-contributors' ],
'Contributors' )
+ . Html::rawElement( 'div', [ 'class' =>
'contributors' ],
+ Html::rawElement( 'ul', [], implode( "\n",
$list ) ) );
+ }
+
+ /**
+ * Get the part inside the <body> from an HTML file.
+ * Not very robust (a <body> tag in a comment or CDATA section could
confuse it) but the
+ * <head> section has no user-controlled part so using it with Parsoid
HTML should be fine.
+ * @param string $html
+ * @return string
+ */
+ private function getBodyContents( $html ) {
+ return preg_replace(
'/(^.*?<body\b[^>]*>)|(<\/body>\s*<\/html>\s*$)/si', '', $html );
+ }
+
+}
diff --git a/includes/DataProvider.php b/includes/DataProvider.php
new file mode 100644
index 0000000..d303e68
--- /dev/null
+++ b/includes/DataProvider.php
@@ -0,0 +1,122 @@
+<?php
+
+namespace MediaWiki\Extensions\Collection;
+
+use ApiMain;
+use DerivativeRequest;
+use MediaWiki\Logger\LoggerFactory;
+use MultiHttpClient;
+use RequestContext;
+use Title;
+
+/**
+ * Given a set of titles, fetches article content and various metadata like
authors.
+ */
+class DataProvider {
+
+	/**
+	 * Fetch HTML for the pages in a collection.
+	 *
+	 * All article items are requested in one MultiHttpClient batch. A request which reports
+	 * an error yields an empty string for that page.
+	 * @param array[] $collection Collection, as returned by CollectionSession::getCollection().
+	 * @return string[] Map of prefixed DB key => Parsoid HTML.
+	 */
+	public function fetchPages( $collection ) {
+		$pages = [];
+		$reqs = [];
+		foreach ( $collection['items'] as $item ) {
+			if ( $item['type'] === 'article' ) {
+				// FIXME use VirtualRESTServiceClient
+				$title = Title::newFromText( $item['title'] );
+				$dbkey = $title->getPrefixedDBkey();
+				$parsoidBaseURI = 'https://en.wikipedia.org/api/rest_v1/page/html/';
+				// The title is a single path segment: without percent-encoding, titles
+				// containing '/', '?', '#' or '%' would request a different URL.
+				$url = $parsoidBaseURI . rawurlencode( $dbkey );
+				// TODO request the specific revision when the item has one:
+				// $url .= '/' . $item['revision'];
+				$reqs[] = [ 'method' => 'GET', 'url' => $url ];
+				$pages[] = $dbkey;
+			}
+		}
+		$client = new MultiHttpClient( [ 'logger' => LoggerFactory::getInstance( 'multihttp' ) ] );
+		$resp = $client->runMulti( $reqs );
+		return array_combine( $pages, array_map( function ( $item ) {
+			return !empty( $item['response']['error'] ) ? '' : $item['response']['body'];
+		}, $resp ) );
+	}
+
+	/**
+	 * Fetch metadata (sections, contributors and RL modules) for the pages in a collection.
+	 *
+	 * Contributors are collected with internal action=query requests, following API
+	 * continuation until the result is complete. Sections, display title and ResourceLoader
+	 * data come from one internal action=parse request per page.
+	 * @param string[] $dbkeys DB keys of the articles contained in the book.
+	 * @return array
+	 *   - displaytitle: [ dbkey => title, ... ]
+	 *   - sections: [ dbkey => [ [ title => ..., id => ..., level => ... ], ... ], ... ]
+	 *   - contributors: [ name => userid, ... ]
+	 *   - modules: [ module, ... ]
+	 *   - modulescripts: [ module, ... ]
+	 *   - modulestyles: [ module, ... ]
+	 *   - jsconfigvars: [ var, ... ]
+	 */
+	public function fetchMetadata( $dbkeys ) {
+		$metadata = [
+			'displaytitle' => [],
+			'sections' => [],
+			'contributors' => [],
+			'modules' => [],
+			'modulescripts' => [],
+			'modulestyles' => [],
+			'jsconfigvars' => [],
+		];
+
+		// get contributors
+		// FIXME probably better to use direct SQL queries
+		$params = [
+			'format' => 'json',
+			'action' => 'query',
+			'prop' => 'contributors',
+			'redirects' => 1,
+			'pclimit' => 'max', // 500; more titles than that will probably blow up Electron anyway
+			'titles' => implode( '|', $dbkeys ),
+		];
+		$request = RequestContext::getMain()->getRequest();
+		do {
+			$api = new ApiMain( new DerivativeRequest( $request, $params ) );
+			$api->execute();
+			$data = $api->getResult()->getResultData( [], [ 'Strip' => 'all' ] );
+			// The continuation parameters are part of the API response ($data), not of the
+			// accumulator ($metadata). Merging them first makes them override older values.
+			$continue = isset( $data['continue'] ) ? $data['continue'] : [];
+			$params = $continue + $params;
+			$resultPages = isset( $data['query']['pages'] ) ? $data['query']['pages'] : [];
+			foreach ( $resultPages as $page ) {
+				if ( !isset( $page['contributors'] ) ) {
+					// nothing to collect for this page in this batch
+					continue;
+				}
+				foreach ( $page['contributors'] as $contrib ) {
+					$metadata['contributors'][$contrib['name']] = $contrib['userid'];
+				}
+			}
+		} while ( $continue );
+
+		// get sections & modules
+		// TODO collect sections from MCS and ResourceLoader modules from Parsoid once T69540 is resolved?
+		foreach ( $dbkeys as $dbkey ) {
+			$params = [
+				'format' => 'json',
+				'action' => 'parse',
+				'prop' => 'sections|displaytitle|modules|jsconfigvars',
+				'page' => $dbkey,
+			];
+			$api = new ApiMain( new DerivativeRequest( $request, $params ) );
+			$api->execute();
+			$data = $api->getResult()->getResultData( [], [ 'Strip' => 'all' ] );
+			$metadata['displaytitle'][$dbkey] = $data['parse']['displaytitle'];
+			$metadata['sections'][$dbkey] = array_map( function ( $sectionData ) {
+				return [
+					'title' => $sectionData['line'],
+					'id' => $sectionData['anchor'],
+					'level' => $sectionData['level'],
+				];
+			}, $data['parse']['sections'] );
+			foreach ( [ 'modules', 'modulescripts', 'modulestyles', 'jsconfigvars' ] as $field ) {
+				// let's hope there is no conflict in jsconfigvars...
+				$metadata[$field] = array_merge( $metadata[$field], $data['parse'][$field] );
+			}
+		}
+
+		return $metadata;
+	}
+
+}
diff --git a/includes/RemexCollectionMunger.php
b/includes/RemexCollectionMunger.php
new file mode 100644
index 0000000..30d819a
--- /dev/null
+++ b/includes/RemexCollectionMunger.php
@@ -0,0 +1,335 @@
+<?php
+
+namespace MediaWiki\Extensions\Collection;
+
+use RemexHtml\Serializer\Serializer;
+use RemexHtml\Tokenizer\Attributes;
+use RemexHtml\TreeBuilder\Element;
+use RemexHtml\TreeBuilder\TreeHandler;
+
+/**
+ * DOM tree munger for RemexHtml that makes small adjustments to a HTML
document for including
+ * in a collection (a HTML document that's more or less the concatenation of
multiple original
+ * documents).
+ *
+ * The munger is reused for parsing multiple documents and outputs a single
unified document.
+ * It makes small changes to make the resulting document valid and look good:
+ * - converts h1 to h2 while preserving heading structure
+ * - removes the document name from before self-references
+ * - renames conflicting ids
+ */
+class RemexCollectionMunger implements TreeHandler {
+
+ /**
+ * @var array
+ */
+ private $options;
+
+ /**
+ * Map from original document ID to collection document ID.
+ * A value of false means that the ID is reserved and upon encountering
it a new mapping
+ * to a free id needs to be created. A value of true means the ID is
used (ie. not reserved
+ * but will have to be in the next document).
+ * @var array
+ */
+ private $idMap = [];
+
+ /**
+ * Reference to section data. id and level will be updated to keep in
sync with document changes.
+ * @var array[] [[ title => ..., id => ..., level => ... ], ...]
+ */
+ private $sectionRef;
+
+ /**
+ * 1-based index for the current source document in the list of source
documents.
+ * @var int
+ */
+ private $documentIndex = 0;
+
+ /**
+ * URL for the current document, relative to its base URL. For a
Parsoid document this will
+ * be something like './Title'.
+ * @var string
+ */
+ private $selfLink;
+
+ /**
+ * Tracks how many levels headings need to be moved. E.g. a document
with h1;h2;h3
+ * needs to be transformed to h2;h3;h4 while a document with h2;h3;h1
to h2;h3;h2
+ * so we set $headingDisplacementLevel when encountering h1 and use it to
decide what to
+ * do with other headings.
+ * @var int
+ */
+ private $headingDisplacementLevel = 0;
+
+ /**
+ * Source document end position.
+ * @var int
+ */
+ private $endPos;
+
+ /** @var Serializer */
+ private $serializer;
+
+ /**
+ * @param Serializer $serializer
+ * @param array $options
+ * - topHeadingLevel: highest allowed heading level (e.g. '2' means
h1 is disallowed and will
+ * be "pushed down")
+ */
+ public function __construct( Serializer $serializer, $options = [] ) {
+ $this->serializer = $serializer;
+ $this->options = $options + [
+ 'topHeadingLevel' => 2,
+ ];
+ }
+
+ /**
+ * Reset internal state. Needs to be called before parsing a new source
document.
+ * @param string $selfLink URL prefix before # which means this is a
local URL
+ * @param array[] $sections Section data; each section is a triple
+ * [ title => ..., id => ..., level => ... ]. RemexCollectionMunger
will update the id/level
+ * to keep in sync with document changes.
+ */
+ public function startCollectionSection( $selfLink, &$sections ) {
+ $this->documentIndex++;
+ $this->headingDisplacementLevel = 0;
+ // set all mappings to false: they are only valid within a
single source document
+ $this->idMap = array_fill_keys( array_keys( $this->idMap ),
false );
+ $this->sectionRef = &$sections;
+ $this->selfLink = $selfLink;
+ }
+
+ /**
+ * Called by RemexHTML when parsing of a source document starts.
+ * @inheritdoc
+ */
+ public function startDocument( $fragmentNamespace, $fragmentName ) {
+ // This will emit a doctype even if fragment name is set. It
needs to be
+ // removed manually after getting the result from the Formatter.
+ $this->serializer->startDocument( $fragmentNamespace,
$fragmentName );
+ }
+
+ /**
+ * Called by RemexHTML when parsing stops.
+ * @param integer $pos The input string length, i.e. the past-the-end
position.
+ */
+ public function endDocument( $pos ) {
+ $this->endPos = $pos;
+ $this->serializer->endDocument( $this->getPosition( $pos ) );
+ // do nothing - this is not necessarily the end of the output
document.
+ }
+
+ /**
+ * Called by RemexHTML when parsing characters.
+ * @inheritdoc
+ */
+ public function characters(
+ $preposition, $ref, $text, $start, $length, $sourceStart,
$sourceLength
+ ) {
+ $this->serializer->characters( $preposition, $ref, $text,
$start, $length,
+ $this->getPosition( $sourceStart ), $sourceLength );
+ }
+
+ /**
+ * Called by RemexHTML when parsing an element.
+ * @inheritdoc
+ */
+ function insertElement(
+ $preposition, $ref, Element $element, $void, $sourceStart,
$sourceLength
+ ) {
+ // if the serializer has already seen this element, we already
munged it
+ if ( !$element->userData ) {
+ $this->fixHeading( $element );
+ $this->fixId( $element->attrs, $element );
+ }
+ $this->serializer->insertElement( $preposition, $ref, $element,
$void,
+ $this->getPosition( $sourceStart ), $sourceLength );
+ }
+
+ /**
+ * Called by RemexHTML when parsing an end tag.
+ * @inheritdoc
+ */
+ function endTag( Element $element, $sourceStart, $sourceLength ) {
+ $this->serializer->endTag( $element, $this->getPosition(
$sourceStart ), $sourceLength );
+ }
+
+ /**
+ * Called by RemexHTML when parsing a doctype declaration.
+ * @inheritdoc
+ */
+ function doctype( $name, $public, $system, $quirks, $sourceStart,
$sourceLength ) {
+ // we only need the body so no point in forwarding this
+ }
+
+ /**
+ * Called by RemexHTML when parsing a comment.
+ * @inheritdoc
+ */
+ function comment( $preposition, $ref, $text, $sourceStart,
$sourceLength ) {
+ $this->serializer->comment( $preposition, $ref, $text,
+ $this->getPosition( $sourceStart), $sourceLength );
+ }
+
+ /**
+ * Called by RemexHTML on parse errors.
+ * @inheritdoc
+ */
+ function error( $text, $pos ) {
+ $this->serializer->error( $text, $this->getPosition( $pos ) );
+ }
+
+ /**
+ * Called by RemexHTML when updating element attributes.
+ * @inheritdoc
+ */
+ function mergeAttributes( Element $element, Attributes $attrs,
$sourceStart ) {
+ // RemexHTML should only call this method for <html> and <body>
which we discard
+ // so there is probably no need to fix ids but do it anyway
just in case
+ $this->fixId( $attrs, $element );
+ $this->serializer->mergeAttributes( $element, $attrs,
$this->getPosition( $sourceStart ) );
+ }
+
+ /**
+ * Called by RemexHTML in some edge cases when fixing invalid HTML.
+ * @inheritdoc
+ */
+ function removeNode( Element $element, $sourceStart ) {
+ $this->serializer->removeNode( $element, $this->getPosition(
$sourceStart ) );
+ }
+
+ /**
+ * Called by RemexHTML in some edge cases when fixing invalid HTML.
+ * @inheritdoc
+ */
+ function reparentChildren( Element $element, Element $newParent,
$sourceStart ) {
+ $this->serializer->reparentChildren( $element, $newParent,
$this->getPosition( $sourceStart ) );
+ }
+
+	/**
+	 * Translate a position in one of the source documents to a position in the document
+	 * collection. This is only used for debugging so we just generate a number which makes
+	 * it obvious where to look in the source documents.
+	 * @param int $originalSourceStart Position within the current source document.
+	 * @return int
+	 */
+	private function getPosition( $originalSourceStart ) {
+		// "concatenate" document index and position within document.
+		// this leaves ~100MB index space for each document which is plenty, and still fits
+		// comfortably into an int even on 32-bit builds.
+		// The multiplier is written as an integer literal: 1e8 is a float in PHP and would
+		// turn the returned position into a float.
+		return $this->documentIndex * 100000000 + $originalSourceStart;
+	}
+
+ /**
+ * Fix $element if it is a heading with the wrong level.
+ * h1 and maybe h2 are reserved for chapter/article titles, if we
encounter any,
+ * force the whole heading structure to be on a lower level.
+ * @param Element $element
+ */
+ private function fixHeading( $element ) {
+ if ( !$this->isHeading( $element ) ) {
+ return;
+ }
+
+ $level = (int)substr( $element->htmlName, 1 );
+ $displace = max( $this->headingDisplacementLevel,
$this->options['topHeadingLevel'] - $level );
+ $this->headingDisplacementLevel = $displace;
+ $newLevel = min( $level + $displace, 6 );
+ if ( $newLevel !== $level ) {
+ // update section data
+ if ( isset( $element->attrs['id'] ) ) {
+ foreach ( $this->sectionRef as $index =>
$section ) {
+ if ( $section['id'] ===
$element->attrs['id'] ) {
+
$this->sectionRef[$index]['level'] = $newLevel;
+ }
+ }
+ }
+ $element->name = $element->htmlName = 'h' . $newLevel;
+ }
+ }
+
+ /**
+ * Fix $element if it has or refers to an id which conflicts with an id
in another document.
+ * Needed to prevent id conflicts (e.g. two documents using the same
section name). Also fix
+ * Parsoid internal references to be #section, not ./Title#section.
+ * @param Attributes $attrs
+ * @param Element $element
+ */
+ private function fixId( $attrs, $element ) {
+ if ( isset( $attrs['id'] ) ) {
+ $newId = $this->getUnreservedId( $attrs['id'] );
+ if ( $newId !== $attrs['id'] ) {
+ // if we renamed a heading anchor, update
section data
+ if ( $this->isHeading( $element ) ) {
+ foreach ( $this->sectionRef as $index
=> $section ) {
+ if ( $section['id'] ===
$attrs['id'] ) {
+
$this->sectionRef[$index]['id'] = $newId;
+ break;
+ }
+ }
+ }
+ $attrs['id'] = $newId;
+ }
+ }
+ // Make sure local references are in sync with ids.
+ // We don't try to update cross-document references, too much
effort.
+ if (
+ $element->htmlName === 'a' && isset( $attrs['href'] )
+ && $this->startsWith( $attrs['href'], $this->selfLink .
'#' )
+ ) {
+ $id = substr( $attrs['href'], strlen( $this->selfLink )
+ 1 );
+ $id = $this->getUnreservedId( $id );
+ $attrs['href'] = '#' . $id;
+ }
+ }
+
+ /**
+ * Get an unreserved id and update the mapping.
+ * Will return $id if it does not conflict with earlier documents;
otherwise it will find
+ * a free name and use that instead, consistently.
+ * @param string $id
+ * @return string
+ */
+ private function getUnreservedId( $id ) {
+ if ( !isset( $this->idMap[$id] ) ) {
+ // No conflict. Mark this id as being in use.
+ $this->idMap[$id] = true;
+ return $id;
+ } elseif ( $this->idMap[$id] === true ) {
+ // This id has been used in the same source document.
That's fine, nothing to do.
+ return $id;
+ } elseif ( $this->idMap[$id] === false ) {
+ // This id has been used in a different source
document, must remap.
+ $n = 2;
+ do {
+ $replacement = $id . '_' . $n++;
+ } while ( isset( $this->idMap[$replacement] ) );
+ $this->idMap[$id] = $replacement;
+ $this->idMap[$replacement] = false;
+ return $replacement;
+ } else {
+ // This id has already been remapped for the
current source document.
+ return $this->idMap[$id];
+ }
+ }
+
+ /**
+ * Is $element a HTML heading (h1..h6) tag?
+ * @param Element $element
+ * @return bool
+ */
+ private function isHeading( $element ) {
+ return in_array( $element->htmlName, [ 'h1', 'h2', 'h3', 'h4',
'h5', 'h6' ], true );
+ }
+
+ /**
+ * Check for prefix match.
+ * @param string $haystack
+ * @param string $needle
+ * @return bool
+ */
+ private function startsWith( $haystack, $needle ) {
+ return substr_compare( $haystack, $needle, 0, strlen( $needle )
) === 0;
+ }
+}
diff --git a/modules/offline.css b/modules/offline.css
new file mode 100644
index 0000000..2ee9bbf
--- /dev/null
+++ b/modules/offline.css
@@ -0,0 +1,22 @@
+.mw-book-tocitem-type-chapter {
+ font-size: 140%;
+}
+.mw-book-tocitem-type-article {
+ font-size: 120%;
+ padding-left: 5px;
+}
+.mw-book-tocitem-type-section.level-2 {
+ padding-left: 10px;
+}
+.mw-book-tocitem-type-section.level-3 {
+ padding-left: 15px;
+}
+.mw-book-tocitem-type-section.level-4 {
+ padding-left: 20px;
+}
+.mw-book-tocitem-type-section.level-5 {
+ padding-left: 25px;
+}
+.mw-book-tocitem-type-section.level-6 {
+ padding-left: 30px;
+}
--
To view, visit https://gerrit.wikimedia.org/r/361453
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I686736e0b2c9f98a7c37046336219a0f852179b0
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/Collection
Gerrit-Branch: master
Gerrit-Owner: Gergő Tisza <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits