[MediaWiki-commits] [Gerrit] mediawiki...Collection[master]: [WIP] Concatenate pages and send to Electron

Code Review Mon, 26 Jun 2017 06:55:21 -0700

Gergő Tisza has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/361453 )


Change subject: [WIP] Concatenate pages and send to Electron
......................................................................

[WIP] Concatenate pages and send to Electron

Change-Id: I686736e0b2c9f98a7c37046336219a0f852179b0
---
M Collection.alias.php
M Collection.hooks.php
M Collection.php
A SpecialRenderBook.php
A includes/BookRenderer.php
A includes/DataProvider.php
A includes/RemexCollectionMunger.php
A modules/offline.css
8 files changed, 817 insertions(+), 0 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/Collection 
refs/changes/53/361453/1

diff --git a/Collection.alias.php b/Collection.alias.php
index 770ffc6..28c961e 100644
--- a/Collection.alias.php
+++ b/Collection.alias.php
@@ -11,6 +11,7 @@
 /** English (English) */
 $specialPageAliases['en'] = [
        'Book' => [ 'Book', 'Collection' ],
+       'RenderBook' => [ 'RenderBook' ],
 ];
 
 /** Afrikaans (Afrikaans) */
diff --git a/Collection.hooks.php b/Collection.hooks.php
index 797e87f..65103dd 100644
--- a/Collection.hooks.php
+++ b/Collection.hooks.php
@@ -31,6 +31,15 @@
        }
 
        /**
+        * Force the invisible skin on raw book views.
+        *
+        * NOTE(review): the handler body is currently empty, so no skin is forced yet;
+        * the behaviour described above still has to be implemented.
+        *
+        * @param RequestContext $context
+        * @param Skin $skin Passed by reference.
+        */
+       public static function onRequestContextCreateSkin( RequestContext 
$context, Skin &$skin ) {
+
+       }
+
+       /**
         * Callback for hook SkinBuildSidebar
         *
         * @param $skin Skin
diff --git a/Collection.php b/Collection.php
index 53cea55..fc989a6 100644
--- a/Collection.php
+++ b/Collection.php
@@ -179,6 +179,14 @@
 $wgAutoloadClasses['CollectionSuggest'] = __DIR__ . '/Collection.suggest.php';
 $wgAutoloadClasses['CollectionProposals'] = __DIR__ . 
'/Collection.suggest.php';
 
+// Classes used by Special:RenderBook. The namespaced keys are all written fully qualified.
+$wgAutoloadClasses['SpecialRenderBook'] = __DIR__ . '/SpecialRenderBook.php';
+$wgAutoloadClasses[\MediaWiki\Extensions\Collection\DataProvider::class]
+       = __DIR__ . '/includes/DataProvider.php';
+$wgAutoloadClasses[\MediaWiki\Extensions\Collection\BookRenderer::class]
+       = __DIR__ . '/includes/BookRenderer.php';
+$wgAutoloadClasses[\MediaWiki\Extensions\Collection\RemexCollectionMunger::class]
+       = __DIR__ . '/includes/RemexCollectionMunger.php';
+
 $wgAutoloadClasses['CollectionPageTemplate'] = __DIR__ . 
'/templates/CollectionPageTemplate.php';
 $wgAutoloadClasses['CollectionListTemplate'] = __DIR__ . 
'/templates/CollectionListTemplate.php';
 $wgAutoloadClasses['CollectionLoadOverwriteTemplate'] =
@@ -204,6 +212,7 @@
 $wgExtensionMessagesFiles['CollectionAlias'] = __DIR__ . 
'/Collection.alias.php';
 
 $wgSpecialPages['Book'] = 'SpecialCollection';
+$wgSpecialPages['RenderBook'] = 'SpecialRenderBook';
 
 $wgHooks['SkinTemplateBuildNavUrlsNav_urlsAfterPermalink'][] = 
'CollectionHooks::buildNavUrls';
 $wgHooks['SidebarBeforeOutput'][] = 'CollectionHooks::buildSidebar';
@@ -248,6 +257,9 @@
                'scripts' => 'suggest.js',
                'dependencies' => 'ext.collection.bookcreator'
        ],
+       'ext.collection.offline' => $collResourceTemplate + [
+               'styles' => 'offline.css',
+       ],
 ];
 
 # register global Ajax functions:
diff --git a/SpecialRenderBook.php b/SpecialRenderBook.php
new file mode 100644
index 0000000..f45fa9c
--- /dev/null
+++ b/SpecialRenderBook.php
@@ -0,0 +1,139 @@
+<?php
+
+use MediaWiki\Extensions\Collection\BookRenderer;
+use MediaWiki\Extensions\Collection\DataProvider;
+use MediaWiki\Logger\LoggerFactory;
+use MediaWiki\MediaWikiServices;
+
+/**
+ * Debugging helper: dumps a value through Symfony's VarDumper.
+ *
+ * NOTE(review): nothing in the visible code of this file calls it; it looks like a
+ * development leftover - confirm before merging.
+ *
+ * @param mixed $x Value to dump.
+ */
+function d($x) {
+       \Symfony\Component\VarDumper\VarDumper::dump( $x );
+}
+
+/**
+ * Special page to display a book as a single HTML page.
+ *
+ * The subpage selects the output mode: 'raw' and 'skinned' output the book HTML (with the
+ * API skin and with the regular skin, respectively), 'electron' redirects to the Electron
+ * PDF service, and anything else shows a list of links to these modes.
+ */
+class SpecialRenderBook extends SpecialPage {
+
+       /** Lifetime of a cached rendered book, in seconds. */
+       const CACHE_TTL = 300;
+
+       /** @var WANObjectCache */
+       private $htmlCache;
+
+       /** @var VirtualRESTServiceClient */
+       private $restClient;
+
+       public function __construct() {
+               parent::__construct( 'RenderBook' );
+
+               $services = MediaWikiServices::getInstance();
+               $this->htmlCache = $services->getMainWANObjectCache();
+               // $this->restClient = $services->getVirtualRESTServiceClient();
+               // FIXME hack RESTBase not working in local vagrant
+       }
+
+       /**
+        * Entry point of the special page.
+        * @param string|null $subPage Output mode: 'raw', 'skinned' or 'electron'; any other
+        *   value shows the list of available modes.
+        */
+       public function execute( $subPage ) {
+               $out = $this->getOutput();
+
+               switch ( $subPage ) {
+                       case 'raw':
+                               $context = new DerivativeContext( $this->getContext() );
+                               $context->setSkin( new SkinApi() );
+                               $out->setContext( $context );
+                               // fall through
+                       case 'skinned':
+                               $book = $this->getBook();
+                               $out->addModuleStyles( [ 'ext.collection.offline' ] );
+                               $out->addModules( $book['modules'] );
+                               $out->addModuleScripts( $book['modulescripts'] );
+                               $out->addModuleStyles( $book['modulestyles'] );
+                               $out->addJsConfigVars( $book['jsconfigvars'] );
+                               $out->addHTML( $book['html'] );
+                               return;
+
+                       case 'electron':
+                               // FIXME should just redirect to /raw instead and make that cacheable but
+                               // need non-session-based storage for that
+                               $book = $this->getBook();
+
+                               $bookUrl = $this->getPageTitle( 'skinned' )->getFullURL( [ 'key' => $book['key'] ] );
+                               // hack hack
+                               $electronUrl = 'http://electron.local.wmftest.net:10241/pdf';
+                               $accessKey = 'secret';
+
+                               $url = $electronUrl . '?' . wfArrayToCgi( [
+                                       'accessKey' => $accessKey,
+                                       'url' => $bookUrl,
+                               ] );
+                               $out->redirect( $url );
+                               return;
+
+                       default:
+                               $options = [
+                                       'raw' => 'HTML, raw',
+                                       'skinned' => 'HTML, as wiki page',
+                                       'electron' => 'PDF, raw',
+                               ];
+                               $html = Html::openElement( 'ul', [] );
+                               foreach ( $options as $mode => $description ) {
+                                       $linkUrl = $this->getPageTitle( $mode )->getFullURL();
+                                       $link = Html::element( 'a', [ 'href' => $linkUrl ], $description );
+                                       $html .= Html::rawElement( 'li', [], $link );
+                               }
+                               $html .= Html::closeElement( 'ul' );
+                               $out->addHTML( $html );
+                               return;
+               }
+       }
+
+       /**
+        * Get the rendered book, from cache or by rendering it.
+        *
+        * The complete result (not just the HTML) is cached, so a cache hit provides the same
+        * fields as a fresh render.
+        *
+        * @return array associative array with:
+        *    - 'html': HTML, not including <body> or anything outside that.
+        *    - 'key': token identifying the cached book; pass it back in the 'key' request
+        *      parameter to load the book from the cache.
+        *    - 'modules', 'modulescripts', 'modulestyles', 'jsconfigvars': ResourceLoader data
+        *      collected from the pages of the book.
+        */
+       private function getBook() {
+               // The request only carries a content hash. The cache key is always built here, so
+               // a request cannot be used to read arbitrary entries of the WAN cache.
+               $hash = $this->getRequest()->getText( 'key' );
+               if ( preg_match( '/\A[0-9a-f]{32}\z/', $hash ) ) {
+                       $book = $this->htmlCache->get( $this->getCacheKey( $hash ) );
+                       if ( is_array( $book ) && isset( $book['html'] ) ) {
+                               return $book;
+                       }
+               }
+
+               $dataProvider = new DataProvider();
+               $bookRenderer = new BookRenderer();
+
+               $collection = $this->getCollection();
+               $pages = $dataProvider->fetchPages( $collection );
+               $metadata = $dataProvider->fetchMetadata( array_keys( $pages ) );
+               $html = $bookRenderer->renderBook( $collection, $pages, $metadata );
+
+               // TODO generate the key from the collection metadata instead
+               $hash = md5( $html );
+               $book = [
+                       'key' => $hash,
+                       'html' => $html,
+                       'modules' => $metadata['modules'],
+                       'modulescripts' => $metadata['modulescripts'],
+                       'modulestyles' => $metadata['modulestyles'],
+                       'jsconfigvars' => $metadata['jsconfigvars'],
+               ];
+               $this->htmlCache->set( $this->getCacheKey( $hash ), $book, self::CACHE_TTL );
+
+               return $book;
+       }
+
+       /**
+        * Build the WAN cache key under which a rendered book is stored.
+        * @param string $hash Content hash of the book, as found in the 'key' field of getBook().
+        * @return string
+        */
+       private function getCacheKey( $hash ) {
+               return $this->htmlCache->makeGlobalKey( 'collection-book', $hash );
+       }
+
+       /**
+        * Returns the current collection, starting a collection session if there is none.
+        * @return array[] Collection, as returned by CollectionSession::getCollection().
+        * @throws ErrorPageError When there is no enabled, non-empty collection.
+        */
+       private function getCollection() {
+               if ( !CollectionSession::hasSession() ) {
+                       CollectionSession::startSession();
+               }
+               $collection = CollectionSession::getCollection();
+               // empty() treats a missing field the same way as a falsy one, without notices
+               if ( !$collection || empty( $collection['enabled'] ) || empty( $collection['items'] ) ) {
+                       throw new ErrorPageError( 'collection-error-title', 'collection-error-no-session' );
+               }
+               return $collection;
+       }
+
+}
diff --git a/includes/BookRenderer.php b/includes/BookRenderer.php
new file mode 100644
index 0000000..adf3ce4
--- /dev/null
+++ b/includes/BookRenderer.php
@@ -0,0 +1,177 @@
+<?php
+
+namespace MediaWiki\Extensions\Collection;
+
+use Html;
+use LogicException;
+use Sanitizer;
+use Title;
+
+/**
+ * Renders HTML view of a book by concatenating and transforming HTML and generating some
+ * leading/trailing pages.
+ */
+class BookRenderer {
+
+       /**
+        * Generate the concatenated page.
+        * @param array[] $collection Collection, as returned by CollectionSession::getCollection().
+        * @param string[] $pages Map of prefixed DB key => Parsoid HTML.
+        * @param array[] $metadata Metadata, as returned by DataProvider::fetchMetadata().
+        * @return string HTML of the rendered book (without body/head).
+        * @throws LogicException On an invalid article title or an unknown item type.
+        */
+       public function renderBook( $collection, $pages, $metadata ) {
+               $hasChapters = (bool)array_filter( $collection['items'], function ( $item ) {
+                       return $item['type'] === 'chapter';
+               } );
+
+               $final = '';
+
+               // First we need to render the articles as we can't know the TOC anchors for sure
+               // until we have resolved id conflicts.
+               // FastFormatter chokes on Parsoid HTML. HtmlFormatter is still plenty fast anyway.
+               $formatter = new \RemexHtml\Serializer\HtmlFormatter();
+               $serializer = new \RemexHtml\Serializer\Serializer( $formatter );
+               $munger = new RemexCollectionMunger( $serializer, [
+                       'topHeadingLevel' => $hasChapters ? 3 : 2,
+               ] );
+               $munger->startCollection();
+               foreach ( $collection['items'] as $item ) {
+                       if ( $item['type'] === 'chapter' ) {
+                               // Chapter titles are plain text supplied by the user, so they must be
+                               // escaped (the table of contents escapes them as well).
+                               $final .= Html::element( 'h1', [
+                                               'id' => 'mw-book-chapter-' . Sanitizer::escapeId( $item['title'] ),
+                                               'class' => 'mw-book-chapter',
+                                       ], $item['title'] ) . "\n";
+                       } elseif ( $item['type'] === 'article' ) {
+                               $dbkey = $this->getArticleDbKey( $item );
+                               // a page which could not be fetched is rendered as an empty article
+                               $pageHtml = isset( $pages[$dbkey] ) ? $pages[$dbkey] : '';
+                               $html = $this->getBodyContents( $pageHtml );
+
+                               // the display title is HTML, so it is inserted unescaped
+                               $final .= Html::rawElement( 'h2', [
+                                               'id' => 'mw-book-article-' . $dbkey,
+                                               'class' => 'mw-book-article',
+                                       ], $metadata['displaytitle'][$dbkey] ) . "\n";
+
+                               // The section data is passed by reference: the munger updates the
+                               // ids/levels, and the table of contents is built from the updated data.
+                               $munger->startCollectionSection( './' . $dbkey, $metadata['sections'][$dbkey] );
+                               $treeBuilder = new \RemexHtml\TreeBuilder\TreeBuilder( $munger, [] );
+                               $dispatcher = new \RemexHtml\TreeBuilder\Dispatcher( $treeBuilder );
+                               $tokenizer = new \RemexHtml\Tokenizer\Tokenizer( $dispatcher, $html, [
+                                       // HTML comes from Parsoid so we can skip validation
+                                       'ignoreErrors' => true,
+                                       'ignoreCharRefs' => true,
+                                       'ignoreNulls' => true,
+                                       'skipPreprocess' => true,
+                               ] );
+                               $tokenizer->execute( [
+                                       'fragmentNamespace' => \RemexHtml\HTMLData::NS_HTML,
+                                       'fragmentName' => 'body',
+                               ] );
+                               // the serializer output starts with a doctype which has to be stripped
+                               $final .= Html::openElement( 'article' )
+                                       . substr( $serializer->getResult(), strlen( '<!DOCTYPE html>' ) )
+                                       . Html::closeElement( 'article' );
+                       }
+               }
+               $munger->endCollection();
+
+               $final = $this->renderCoverAndToc( $collection, $metadata )
+                       . $final
+                       . $this->renderContributors( $metadata );
+               return $final;
+       }
+
+       /**
+        * Generate HTML for book cover page and table of contents.
+        * @param array $collection Collection, as returned by CollectionSession::getCollection().
+        * @param array[] $metadata Metadata, as returned by DataProvider::fetchMetadata().
+        * @return string HTML to prepend to the book.
+        * @throws LogicException On an invalid article title or an unknown item type.
+        */
+       private function renderCoverAndToc( $collection, $metadata ) {
+               $cover = '';
+
+               $bookTitle = isset( $collection['title'] ) ? $collection['title'] : '';
+               $bookSubtitle = isset( $collection['subtitle'] ) ? $collection['subtitle'] : '';
+               if ( $bookTitle ) {
+                       $cover = Html::element( 'h1', [], $bookTitle ) . "\n";
+                       if ( $bookSubtitle ) {
+                               $cover .= Html::element( 'h2', [], $bookSubtitle ) . "\n";
+                       }
+                       $cover = Html::rawElement( 'header', [], $cover );
+               }
+
+               $outline = [];
+               foreach ( $collection['items'] as $item ) {
+                       if ( $item['type'] === 'chapter' ) {
+                               $outline[] = [
+                                       'text' => $item['title'],
+                                       'type' => 'chapter',
+                                       'anchor' => 'mw-book-chapter-' . Sanitizer::escapeId( $item['title'] ),
+                               ];
+                       } elseif ( $item['type'] === 'article' ) {
+                               $dbkey = $this->getArticleDbKey( $item );
+                               $outline[] = [
+                                       // The display title is HTML while the TOC entries are escaped as
+                                       // plain text, so convert it to plain text first.
+                                       'text' => Sanitizer::stripAllTags( $metadata['displaytitle'][$dbkey] ),
+                                       'type' => 'article',
+                                       'anchor' => 'mw-book-article-' . $dbkey,
+                               ];
+                               foreach ( $metadata['sections'][$dbkey] as $section ) {
+                                       $outline[] = [
+                                               'text' => $section['title'],
+                                               'type' => 'section',
+                                               'level' => $section['level'],
+                                               'anchor' => $section['id'],
+                                       ];
+                               }
+                       } else {
+                               throw new LogicException( 'Unknown collection item type: ' . $item['type'] );
+                       }
+               }
+               $outline[] = [
+                       'text' => 'Contributors',
+                       'type' => 'contributors',
+                       'anchor' => 'mw-book-contributors',
+               ];
+
+               $tocItems = array_map( function ( $item ) {
+                       $class = [ 'mw-book-tocitem-type-' . $item['type'] ];
+                       if ( $item['type'] === 'section' ) {
+                               $class[] = 'level-' . $item['level'];
+                       }
+                       return Html::rawElement( 'li', [ 'class' => $class ],
+                               Html::element( 'a', [ 'href' => '#' . $item['anchor'] ], $item['text'] ) );
+               }, $outline );
+               $toc = Html::rawElement( 'nav', [ 'class' => 'toc' ],
+                       Html::element( 'h1', [], 'Table of Contents' ) . "\n"
+                       . Html::rawElement( 'ul', [], implode( "\n", $tocItems ) ) );
+
+               return $cover . $toc;
+       }
+
+       /**
+        * Generate HTML for the list of contributors.
+        * @param array[] $metadata Metadata, as returned by DataProvider::fetchMetadata().
+        * @return string HTML to append to the book.
+        */
+       private function renderContributors( $metadata ) {
+               // contributors are keyed by user name
+               $list = array_map( function ( $name ) {
+                       return Html::element( 'li', [], $name );
+               }, array_keys( $metadata['contributors'] ) );
+
+               return Html::element( 'h1', [ 'id' => 'mw-book-contributors' ], 'Contributors' )
+                       . Html::rawElement( 'div', [ 'class' => 'contributors' ],
+                               Html::rawElement( 'ul', [], implode( "\n", $list ) ) );
+       }
+
+       /**
+        * Get the prefixed DB key of an article item of the collection.
+        * @param array $item Collection item of type 'article'.
+        * @return string
+        * @throws LogicException When the title of the item is not a valid page title.
+        */
+       private function getArticleDbKey( $item ) {
+               $title = Title::newFromText( $item['title'] );
+               if ( !$title ) {
+                       throw new LogicException( 'Invalid article title: ' . $item['title'] );
+               }
+               return $title->getPrefixedDBkey();
+       }
+
+       /**
+        * Get the part inside the <body> from an HTML file.
+        * Not very robust (a <body> tag in a comment or CDATA section could confuse it) but the
+        * <head> section has no user-controlled part so using it with Parsoid HTML should be fine.
+        * @param string $html
+        * @return string
+        */
+       private function getBodyContents( $html ) {
+               return preg_replace( '/(^.*?<body\b[^>]*>)|(<\/body>\s*<\/html>\s*$)/si', '', $html );
+       }
+
+}
diff --git a/includes/DataProvider.php b/includes/DataProvider.php
new file mode 100644
index 0000000..d303e68
--- /dev/null
+++ b/includes/DataProvider.php
@@ -0,0 +1,122 @@
+<?php
+
+namespace MediaWiki\Extensions\Collection;
+
+use ApiMain;
+use DerivativeRequest;
+use MediaWiki\Logger\LoggerFactory;
+use MultiHttpClient;
+use RequestContext;
+use Title;
+
+/**
+ * Given a set of titles, fetches article content and various metadata like authors.
+ */
+class DataProvider {
+
+       /**
+        * Fetch HTML for the pages in a collection.
+        * Items which are not articles, or whose title is invalid, are skipped.
+        * @param array[] $collection Collection, as returned by CollectionSession::getCollection().
+        * @return string[] Map of prefixed DB key => Parsoid HTML (empty string when the request
+        *   reported an error).
+        */
+       public function fetchPages( $collection ) {
+               $dbkeys = [];
+               $reqs = [];
+               foreach ( $collection['items'] as $item ) {
+                       if ( $item['type'] !== 'article' ) {
+                               continue;
+                       }
+                       // FIXME use VirtualRESTServiceClient
+                       $title = Title::newFromText( $item['title'] );
+                       if ( !$title ) {
+                               // not a valid page title, there is nothing to fetch
+                               continue;
+                       }
+                       $dbkey = $title->getPrefixedDBkey();
+                       $parsoidBaseURI = 'https://en.wikipedia.org/api/rest_v1/page/html/';
+                       // The title is a single path segment of the URL, so characters like "/" or
+                       // "?" in it have to be percent-encoded.
+                       $url = $parsoidBaseURI . rawurlencode( $dbkey );
+                       // TODO fetch the revision given in $item['revision'] when it is set
+                       // (would be: $url .= '/' . $item['revision'];)
+                       $reqs[] = [ 'method' => 'GET', 'url' => $url ];
+                       $dbkeys[] = $dbkey;
+               }
+               $client = new MultiHttpClient( [ 'logger' => LoggerFactory::getInstance( 'multihttp' ) ] );
+               $resp = $client->runMulti( $reqs );
+               return array_combine( $dbkeys, array_map( function ( $item ) {
+                       return !empty( $item['response']['error'] ) ? '' : $item['response']['body'];
+               }, $resp ) );
+       }
+
+       /**
+        * Fetch metadata (sections, contributors and RL modules) for the pages in a collection.
+        * @param string[] $dbkeys DB keys of the articles contained in the book.
+        * @return array
+        *   - displaytitle: [ dbkey => title, ... ]
+        *   - sections: [ dbkey => [ [ title => ..., id => ..., level => ... ], ... ], ... ]
+        *   - contributors: [ name => userid, ... ]
+        *   - modules: [ module, ... ]
+        *   - modulescripts: [ module, ... ]
+        *   - modulestyles: [ module, ... ]
+        *   - jsconfigvars: [ var, ... ]
+        */
+       public function fetchMetadata( $dbkeys ) {
+               $metadata = [
+                       'displaytitle' => [],
+                       'sections' => [],
+                       'contributors' => [],
+                       'modules' => [],
+                       'modulescripts' => [],
+                       'modulestyles' => [],
+                       'jsconfigvars' => [],
+               ];
+
+               // get contributors
+               // FIXME probably better to use direct SQL queries
+               $params = [
+                       'format' => 'json',
+                       'action' => 'query',
+                       'prop' => 'contributors',
+                       'redirects' => 1,
+                       'pclimit' => 'max', // 500; more titles than that will probably blow up Electron anyway
+                       'titles' => implode( '|', $dbkeys ),
+               ];
+               $request = RequestContext::getMain()->getRequest();
+               do {
+                       $api = new ApiMain( new DerivativeRequest( $request, $params ) );
+                       $api->execute();
+                       $data = $api->getResult()->getResultData( [], [ 'Strip' => 'all' ] );
+
+                       $queryPages = isset( $data['query']['pages'] ) ? $data['query']['pages'] : [];
+                       foreach ( $queryPages as $page ) {
+                               // a page has no contributor list in some responses (e.g. when it
+                               // does not exist)
+                               if ( empty( $page['contributors'] ) ) {
+                                       continue;
+                               }
+                               foreach ( $page['contributors'] as $contrib ) {
+                                       $metadata['contributors'][$contrib['name']] = $contrib['userid'];
+                               }
+                       }
+
+                       // The continuation parameters are part of the API response, not of the
+                       // metadata collected so far; merge them into the next request.
+                       $continue = isset( $data['continue'] ) ? $data['continue'] : [];
+                       $params = $continue + $params;
+               } while ( $continue );
+
+               // get sections & modules
+               // TODO collect sections from MCS and ResourceLoader modules from Parsoid once T69540 is resolved?
+               foreach ( $dbkeys as $dbkey ) {
+                       $params = [
+                               'format' => 'json',
+                               'action' => 'parse',
+                               'prop' => 'sections|displaytitle|modules|jsconfigvars',
+                               'page' => $dbkey,
+                       ];
+                       $api = new ApiMain( new DerivativeRequest( $request, $params ) );
+                       $api->execute();
+                       $data = $api->getResult()->getResultData( [], [ 'Strip' => 'all' ] );
+                       $metadata['displaytitle'][$dbkey] = $data['parse']['displaytitle'];
+                       $metadata['sections'][$dbkey] = array_map( function ( $sectionData ) {
+                               return [
+                                       'title' => $sectionData['line'],
+                                       'id' => $sectionData['anchor'],
+                                       'level' => $sectionData['level'],
+                               ];
+                       }, $data['parse']['sections'] );
+                       foreach ( [ 'modules', 'modulescripts', 'modulestyles', 'jsconfigvars' ] as $field ) {
+                               if ( !isset( $data['parse'][$field] ) ) {
+                                       continue;
+                               }
+                               // let's hope there is no conflict in jsconfigvars...
+                               $metadata[$field] = array_merge( $metadata[$field], $data['parse'][$field] );
+                       }
+               }
+
+               return $metadata;
+       }
+
+}
diff --git a/includes/RemexCollectionMunger.php 
b/includes/RemexCollectionMunger.php
new file mode 100644
index 0000000..30d819a
--- /dev/null
+++ b/includes/RemexCollectionMunger.php
@@ -0,0 +1,335 @@
+<?php
+
+namespace MediaWiki\Extensions\Collection;
+
+use RemexHtml\Serializer\Serializer;
+use RemexHtml\Tokenizer\Attributes;
+use RemexHtml\TreeBuilder\Element;
+use RemexHtml\TreeBuilder\TreeHandler;
+
+/**
+ * DOM tree munger for RemexHtml that makes small adjustments to a HTML 
document for including
+ * in a collection (a HTML document that's more or less the concatenation of 
multiple original
+ * documents).
+ *
+ * The munger is reused for parsing multiple documents and outputs a single 
unified document.
+ * It makes small changes to make the resulting document valid and look good:
+ * - converts h1 to h2 while preserving heading structure
+ * - removes the document name from before self-references
+ * - renames conflicting ids
+ */
+class RemexCollectionMunger implements TreeHandler {
+
+       /**
+        * @var array
+        */
+       private $options;
+
+       /**
+        * Map from original document ID to collection document ID.
+        * A value of false means that the ID is reserved and upon encountering 
it a new mapping
+        * to a free id needs to be created. A value of true means the ID is 
used (ie. not reserved
+        * but will have to be in the next document).
+        * @var array
+        */
+       private $idMap = [];
+
+       /**
+        * Reference to section data. id and level will be updated to keep in 
sync with document changes.
+        * @var array[] [[ title => ..., id => ..., level => ... ], ...]
+        */
+       private $sectionRef;
+
+       /**
+        * 1-based index for the current source document in the list of source 
documents.
+        * @var int
+        */
+       private $documentIndex = 0;
+
+       /**
+        * URL for the current document, relative to its base URL. For a 
Parsoid document this will
+        * be something like './Title'.
+        * @var string
+        */
+       private $selfLink;
+
+       /**
+        * Tracks how many levels headings need to be moved. E.g. a document 
with h1;h2;h3
+        * needs to be transformed to h2;h3;h4 while a document with h2;h3;h1 
to h2;h3;h2
+        * so we set $headingDisplacementLevel when encountering h1 and use it to decide what to
+        * do with other headings.
+        * @var int
+        */
+       private $headingDisplacementLevel = 0;
+
+       /**
+        * Source document end position.
+        * @var int
+        */
+       private $endPos;
+
+       /** @var Serializer */
+       private $serializer;
+
+       /**
+        * @param Serializer $serializer Serializer the tree events are forwarded to.
+        * @param array $options Optional settings:
+        *   - topHeadingLevel: highest allowed heading level (e.g. '2' means h1 is disallowed and
+        *     will be "pushed down"). Defaults to 2.
+        */
+       public function __construct( Serializer $serializer, $options = [] ) {
+               $defaults = [
+                       'topHeadingLevel' => 2,
+               ];
+               // array union: options given by the caller win over the defaults
+               $this->options = $options + $defaults;
+               $this->serializer = $serializer;
+       }
+
+       /**
+        * Prepare for the next source document. Must be called before each source document is
+        * parsed.
+        * @param string $selfLink URL prefix before # which means this is a local URL
+        * @param array[] $sections Section data, taken by reference; each section is a triple
+        *   [ title => ..., id => ..., level => ... ]. RemexCollectionMunger will update the
+        *   id/level to keep in sync with document changes.
+        */
+       public function startCollectionSection( $selfLink, &$sections ) {
+               $this->selfLink = $selfLink;
+               $this->sectionRef = &$sections;
+               $this->headingDisplacementLevel = 0;
+               $this->documentIndex++;
+               // Mappings are only valid within a single source document, so every id known so
+               // far is reset to false.
+               foreach ( $this->idMap as $id => $unused ) {
+                       $this->idMap[$id] = false;
+               }
+       }
+
+       /**
+        * Called by RemexHTML when it starts parsing a source document; forwarded to the
+        * serializer unchanged.
+        * @inheritdoc
+        */
+       public function startDocument( $fragmentNamespace, $fragmentName ) {
+               // Note that a doctype is emitted even when a fragment name is set; it has to be
+               // stripped manually from the result obtained from the Formatter.
+               $this->serializer->startDocument(
+                       $fragmentNamespace,
+                       $fragmentName
+               );
+       }
+
+       /**
+        * Called by RemexHTML when parsing of a source document stops.
+        * Records the end position and forwards the event to the serializer.
+        * @param int $pos The input string length, i.e. the past-the-end position.
+        */
+       public function endDocument( $pos ) {
+               // remember where this source document ended
+               $this->endPos = $pos;
+               $this->serializer->endDocument( $this->getPosition( $pos ) );
+               // nothing else to do - this is not necessarily the end of the output document.
+       }
+
+       /**
+        * Called by RemexHTML for character data; forwarded to the serializer with the source
+        * position translated by getPosition().
+        * @inheritdoc
+        */
+       public function characters(
+               $preposition, $ref, $text, $start, $length, $sourceStart, $sourceLength
+       ) {
+               $position = $this->getPosition( $sourceStart );
+               $this->serializer->characters(
+                       $preposition, $ref, $text, $start, $length, $position, $sourceLength
+               );
+       }
+
+       /**
+        * Called by RemexHTML when parsing an element. Fixes the heading level and the id of
+        * the element unless that has been done already, then forwards it to the serializer.
+        * @inheritdoc
+        */
+       public function insertElement(
+               $preposition, $ref, Element $element, $void, $sourceStart, $sourceLength
+       ) {
+               // if the serializer has already seen this element, we already munged it
+               if ( !$element->userData ) {
+                       $this->fixHeading( $element );
+                       $this->fixId( $element->attrs, $element );
+               }
+               $this->serializer->insertElement( $preposition, $ref, $element, $void,
+                       $this->getPosition( $sourceStart ), $sourceLength );
+       }
+
+       /**
+        * Called by RemexHTML when parsing an end tag; forwarded to the serializer with the
+        * source position translated by getPosition().
+        * @inheritdoc
+        */
+       public function endTag( Element $element, $sourceStart, $sourceLength ) {
+               $this->serializer->endTag( $element, $this->getPosition( $sourceStart ), $sourceLength );
+       }
+
+       /**
+        * Called by RemexHTML when parsing a doctype declaration. Intentionally not forwarded
+        * to the serializer.
+        * @inheritdoc
+        */
+       public function doctype( $name, $public, $system, $quirks, $sourceStart, $sourceLength ) {
+               // we only need the body so no point in forwarding this
+       }
+
+       /**
+        * Called by RemexHTML when parsing a comment.
+        * Forwards to the serializer with the source position translated by getPosition().
+        * @inheritdoc
+        */
+       public function comment( $preposition, $ref, $text, $sourceStart, $sourceLength ) {
+               $this->serializer->comment( $preposition, $ref, $text,
+                       $this->getPosition( $sourceStart ), $sourceLength );
+       }
+
+       /**
+        * Called by RemexHTML on parse errors.
+        * Forwards to the serializer with the error position translated by getPosition().
+        * @inheritdoc
+        */
+       public function error( $text, $pos ) {
+               $this->serializer->error( $text, $this->getPosition( $pos ) );
+       }
+
+       /**
+        * Called by RemexHTML when updating element attributes.
+        * Runs the id fix on the incoming attributes, then forwards to the serializer with a
+        * translated source position.
+        * @inheritdoc
+        */
+       public function mergeAttributes( Element $element, Attributes $attrs, $sourceStart ) {
+               // RemexHTML should only call this method for <html> and <body> which we discard
+               // so there is probably no need to fix ids but do it anyway just in case
+               $this->fixId( $attrs, $element );
+               $this->serializer->mergeAttributes( $element, $attrs, $this->getPosition( $sourceStart ) );
+       }
+
+       /**
+        * Called by RemexHTML in some edge cases when fixing invalid HTML.
+        * Forwards to the serializer with the source position translated by getPosition().
+        * @inheritdoc
+        */
+       public function removeNode( Element $element, $sourceStart ) {
+               $this->serializer->removeNode( $element, $this->getPosition( $sourceStart ) );
+       }
+
+       /**
+        * Called by RemexHTML in some edge cases when fixing invalid HTML.
+        * Forwards to the serializer with the source position translated by getPosition().
+        * @inheritdoc
+        */
+       public function reparentChildren( Element $element, Element $newParent, $sourceStart ) {
+               $this->serializer->reparentChildren( $element, $newParent,
+                       $this->getPosition( $sourceStart ) );
+       }
+
+       /**
+        * Translate a position in one of the source documents to a position in the document collection.
+        * This is only used for debugging so we just generate a number which makes it obvious where
+        * to look in the source documents: the document index followed by the eight-digit offset.
+        * @param int $originalSourceStart Position within the current source document
+        * @return int
+        */
+       private function getPosition( $originalSourceStart ) {
+               // "concatenate" document index and position within document.
+               // this leaves ~100MB index space for each document which is plenty.
+               // The multiplier is written as an integer literal: 1e8 is a float literal and would
+               // turn every returned position into a float. On 32-bit builds the result stays an
+               // int for document indexes up to 20; beyond that PHP promotes it to a float.
+               return $this->documentIndex * 100000000 + $originalSourceStart;
+       }
+
+       /**
+        * Move $element to a lower heading level if it is a heading that is placed too high.
+        * The top levels (h1 and maybe h2) are reserved for chapter/article titles. The
+        * displacement is the largest shift any heading has needed so far, and it is applied
+        * to every heading, so the whole heading structure moves down together. The result
+        * is capped at h6.
+        * @param Element $element
+        */
+       private function fixHeading( $element ) {
+               if ( !$this->isHeading( $element ) ) {
+                       return;
+               }
+
+               $oldLevel = (int)substr( $element->htmlName, 1 );
+               // the displacement never shrinks: keep the largest shift needed so far
+               $this->headingDisplacementLevel = max(
+                       $this->headingDisplacementLevel,
+                       $this->options['topHeadingLevel'] - $oldLevel
+               );
+               $shiftedLevel = min( $oldLevel + $this->headingDisplacementLevel, 6 );
+               if ( $shiftedLevel === $oldLevel ) {
+                       return;
+               }
+
+               // keep the section data in sync with the new level
+               if ( isset( $element->attrs['id'] ) ) {
+                       $headingId = $element->attrs['id'];
+                       foreach ( $this->sectionRef as $index => $section ) {
+                               if ( $section['id'] !== $headingId ) {
+                                       continue;
+                               }
+                               $this->sectionRef[$index]['level'] = $shiftedLevel;
+                       }
+               }
+               $newName = 'h' . $shiftedLevel;
+               $element->htmlName = $newName;
+               $element->name = $newName;
+       }
+
+       /**
+        * Rename the id in $attrs if it conflicts with an id in another document, and rewrite
+        * links that point into the current document.
+        * Needed to prevent id conflicts (e.g. two documents using the same section name). When the
+        * renamed id belongs to a heading, the matching section data entry is renamed too. Parsoid
+        * internal references are turned from ./Title#section into #section.
+        * @param Attributes $attrs
+        * @param Element $element
+        */
+       private function fixId( $attrs, $element ) {
+               if ( isset( $attrs['id'] ) ) {
+                       $oldId = $attrs['id'];
+                       $newId = $this->getUnreservedId( $oldId );
+                       if ( $newId !== $oldId ) {
+                               // a renamed heading anchor must be renamed in the section data as well
+                               if ( $this->isHeading( $element ) ) {
+                                       foreach ( $this->sectionRef as $index => $section ) {
+                                               if ( $section['id'] === $oldId ) {
+                                                       $this->sectionRef[$index]['id'] = $newId;
+                                                       break;
+                                               }
+                                       }
+                               }
+                               $attrs['id'] = $newId;
+                       }
+               }
+
+               // Make sure local references are in sync with ids.
+               // We don't try to update cross-document references, too much effort.
+               $isAnchor = $element->htmlName === 'a';
+               if ( !$isAnchor || !isset( $attrs['href'] ) ) {
+                       return;
+               }
+               $localPrefix = $this->selfLink . '#';
+               if ( !$this->startsWith( $attrs['href'], $localPrefix ) ) {
+                       return;
+               }
+               $target = substr( $attrs['href'], strlen( $localPrefix ) );
+               $attrs['href'] = '#' . $this->getUnreservedId( $target );
+       }
+
+       /**
+        * Get an unreserved id and update the mapping.
+        * The id map holds one of three values per id: true (used by the current source document),
+        * false (used by a different source document) or a string (the replacement already chosen
+        * for the current source document). An id that does not conflict is returned unchanged;
+        * otherwise a free name of the form <id>_<n> is picked and used consistently.
+        * @param string $id
+        * @return string
+        */
+       private function getUnreservedId( $id ) {
+               if ( !isset( $this->idMap[$id] ) ) {
+                       // No conflict. Mark this id as being in use.
+                       $this->idMap[$id] = true;
+                       return $id;
+               }
+
+               $state = $this->idMap[$id];
+               if ( $state === true ) {
+                       // Already used in the same source document, which is fine.
+                       return $id;
+               }
+               if ( $state !== false ) {
+                       // Already remapped for the current source document.
+                       return $state;
+               }
+
+               // Used in a different source document: find the first free suffixed name.
+               $suffix = 2;
+               $candidate = $id . '_' . $suffix;
+               while ( isset( $this->idMap[$candidate] ) ) {
+                       $suffix++;
+                       $candidate = $id . '_' . $suffix;
+               }
+               $this->idMap[$id] = $candidate;
+               $this->idMap[$candidate] = false;
+               return $candidate;
+       }
+
+       /**
+        * Tell whether $element is one of the HTML heading tags h1 to h6.
+        * The tag name is compared strictly, so only the exact lowercase names match.
+        * @param Element $element
+        * @return bool
+        */
+       private function isHeading( $element ) {
+               $tag = $element->htmlName;
+               for ( $level = 1; $level <= 6; $level++ ) {
+                       if ( $tag === 'h' . $level ) {
+                               return true;
+                       }
+               }
+               return false;
+       }
+
+       /**
+        * Check for prefix match.
+        * The comparison is byte-wise and case-sensitive. An empty $needle matches every $haystack.
+        * @param string $haystack
+        * @param string $needle
+        * @return bool
+        */
+       private function startsWith( $haystack, $needle ) {
+               // strncmp() copes with a haystack shorter than the needle, including the empty string;
+               // substr_compare() at offset 0 raises a warning and returns false for an empty
+               // haystack on PHP versions before 7.2.18 / 7.3.5.
+               return strncmp( $haystack, $needle, strlen( $needle ) ) === 0;
+       }
+}
diff --git a/modules/offline.css b/modules/offline.css
new file mode 100644
index 0000000..2ee9bbf
--- /dev/null
+++ b/modules/offline.css
@@ -0,0 +1,22 @@
+.mw-book-tocitem-type-chapter { /* chapter items get the largest text and no indentation */
+    font-size: 140%;
+}
+.mw-book-tocitem-type-article { /* article items: smaller than chapters, slightly indented */
+    font-size: 120%;
+    padding-left: 5px;
+}
+.mw-book-tocitem-type-section.level-2 { /* section items keep the default font size; indent is 5px times the level */
+    padding-left: 10px;
+}
+.mw-book-tocitem-type-section.level-3 {
+    padding-left: 15px;
+}
+.mw-book-tocitem-type-section.level-4 {
+    padding-left: 20px;
+}
+.mw-book-tocitem-type-section.level-5 {
+    padding-left: 25px;
+}
+.mw-book-tocitem-type-section.level-6 { /* deepest level styled here */
+    padding-left: 30px;
+}

-- 
To view, visit https://gerrit.wikimedia.org/r/361453
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I686736e0b2c9f98a7c37046336219a0f852179b0
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/Collection
Gerrit-Branch: master
Gerrit-Owner: Gergő Tisza <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

[MediaWiki-commits] [Gerrit] mediawiki...Collection[master]: [WIP] Concatenate pages and send to Electron

Reply via email to