Gergő Tisza has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/364137 )

Change subject: [WIP] Add TOC and page numbers via PDF post-processing
......................................................................

[WIP] Add TOC and page numbers via PDF post-processing

Parse the PDF named destination dictionary to find which page each
section ID appears on, then parse page annotations to find TOC item
positions and add the page number to each. Also add page numbers
to the bottom of each page.

Bug: T168871
Change-Id: Idacd765465ef86029baee502795f91f91b650dd6
---
M Collection.hooks.php
M Collection.php
M SpecialRenderBook.php
M composer.json
M includes/BookRenderer.php
A includes/PdfPostProcessor.php
6 files changed, 267 insertions(+), 25 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/Collection 
refs/changes/37/364137/1

diff --git a/Collection.hooks.php b/Collection.hooks.php
index 797e87f..41b5b6f 100644
--- a/Collection.hooks.php
+++ b/Collection.hooks.php
@@ -21,13 +21,19 @@
  */
 
 class CollectionHooks {
-       public static function onSetup() {
+       public static function onRegistration() {
                // This prevents this extension from being run in environments
                // that don't have the dependent code in core; this should be
                // updated as a part of when additional dependencies are
                // created and pushed into MediaWiki's core.  In particular
                // this tracks the use of OOjs UI in the Collection extension.
                wfUseMW( '1.25wmf19' );
+
+               // install Composer autoloader
+               $extensionRoot =  __DIR__;
+               if ( file_exists( $extensionRoot . '/vendor/autoload.php' ) ) {
+                       require_once $extensionRoot . '/vendor/autoload.php';
+               }
        }
 
        /**
diff --git a/Collection.php b/Collection.php
index adee851..0bb8cd8 100644
--- a/Collection.php
+++ b/Collection.php
@@ -186,6 +186,8 @@
        = __DIR__ . '/includes/BookRenderer.php';
 
$wgAutoloadClasses[MediaWiki\Extensions\Collection\RemexCollectionMunger::class]
        = __DIR__ . '/includes/RemexCollectionMunger.php';
+$wgAutoloadClasses[\MediaWiki\Extensions\Collection\PdfPostProcessor::class]
+       = __DIR__ . '/includes/PdfPostProcessor.php';
 
 $wgAutoloadClasses['CollectionPageTemplate'] = __DIR__ . 
'/templates/CollectionPageTemplate.php';
 $wgAutoloadClasses['CollectionListTemplate'] = __DIR__ . 
'/templates/CollectionListTemplate.php';
@@ -218,7 +220,6 @@
 $wgHooks['SidebarBeforeOutput'][] = 'CollectionHooks::buildSidebar';
 $wgHooks['SiteNoticeAfter'][] = 'CollectionHooks::siteNoticeAfter';
 $wgHooks['OutputPageCheckLastModified'][] = 
'CollectionHooks::checkLastModified';
-$wgExtensionFunctions[] = 'CollectionHooks::onSetup';
 
 $wgAvailableRights[] = 'collectionsaveasuserpage';
 $wgAvailableRights[] = 'collectionsaveascommunitypage';
@@ -281,6 +282,8 @@
 
 $wgCollectionVivlioHtml = __DIR__ . 
'/vivliostyle-viewer/vivliostyle-viewer.html';
 
+CollectionHooks::onRegistration();
+
 # register global Ajax functions:
 
 function wfAjaxGetCollection() {
diff --git a/SpecialRenderBook.php b/SpecialRenderBook.php
index 2002533..1d75c50 100644
--- a/SpecialRenderBook.php
+++ b/SpecialRenderBook.php
@@ -2,6 +2,7 @@
 
 use MediaWiki\Extensions\Collection\BookRenderer;
 use MediaWiki\Extensions\Collection\DataProvider;
+use MediaWiki\Extensions\Collection\PdfPostProcessor;
 use MediaWiki\MediaWikiServices;
 
 function d($x) {
@@ -60,21 +61,27 @@
 
                        case 'electron':
                        case 'electron-vivlio':
-                               // FIXME should just redirect to /raw instead 
and make that cacheable but
-                               // need non-session-based storage for that
                                $book = $this->getBook();
-
-                               $mode = ( $subPage === 'electron-vivlio' ) ? 
'vivlio' : 'skinned';
-                               $bookUrl = $this->getUrl( $mode, $book['key'] );
-                               // hack hack
-                               $electronUrl = 
'http://electron.local.wmftest.net:10241/pdf';
-                               $accessKey = 'secret';
-
-                               $url = $electronUrl . '?' . wfArrayToCgi( [
-                                       'accessKey' => $accessKey,
-                                       'url' => $bookUrl,
-                               ] );
+                               $url = $this->getUrl( $subPage, $book['key'] );
                                $out->redirect( $url );
+                               return;
+
+                       case 'postprocessed':
+                               $book = $this->getBook();
+                               $pdfUrl = $this->getUrl( 'electron', 
$book['key'] );
+                               $pdfContent = Http::get( $pdfUrl );
+
+                               $processor = new PdfPostProcessor();
+                               $newPdfContent = $processor->postProcessPdf( 
$pdfContent, $book['outline'] );
+
+                               $out->disable();
+                               ob_end_clean(); // just in case
+                               $response = 
$this->getContext()->getRequest()->response();
+                               $response->statusHeader( 200 );
+                               $response->header( 'Content-Type: 
application/pdf' );
+                               $response->header( "Content-Disposition: 
attachment; filename='book.pdf'" );
+                               $response->header( 'Content-Length: ' . strlen( 
$newPdfContent ) );
+                               echo $newPdfContent;
                                return;
 
                        default:
@@ -85,6 +92,7 @@
                                        'vivlio' => 'HTML, with vivliostyle 
viewer',
                                        'electron' => 'PDF, raw',
                                        'electron-vivlio' => 'PDF, with 
vivliojs',
+                                       'postprocessed' => 'PDF (raw) 
post-processed in PHP',
                                        'share' => 'shareable link to raw URL',
                                ];
                                $html = Html::openElement( 'ul', [] );
@@ -118,8 +126,23 @@
                        }
                        $bookUrl = $this->getPageTitle( 'share/' . $key 
)->getFullURL();
                        $baseUrl = $this->getConfig()->get( 
'ExtensionAssetsPath' ) . '/Collection';
-                       $url = $baseUrl . 
'/vendor/vivliostyle/viewer/viewer/vivliostyle-viewer.html#x=' . $bookUrl;
+                       $url =
+                               $baseUrl . 
'/vendor/vivliostyle/viewer/viewer/vivliostyle-viewer.html#x=' .
+                               $bookUrl;
                        return wfExpandUrl( $url );
+               } elseif ( $mode === 'electron' || $mode === 'electron-vivlio' 
) {
+                       if ( !$key ) {
+                               return false;
+                       }
+                       $mode = ( $mode === 'electron-vivlio' ) ? 'vivlio' : 
'skinned';
+                       $bookUrl = $this->getUrl( $mode, $key );
+                       // hack hack
+                       $electronUrl = 
'http://electron.local.wmftest.net:10241/pdf';
+                       $accessKey = 'secret';
+                       return $electronUrl . '?' . wfArrayToCgi( [
+                               'accessKey' => $accessKey,
+                               'url' => $bookUrl,
+                       ] );
                } else {
                        return $this->getPageTitle( $mode )->getFullURL( [ 
'key' => $key ] );
                }
@@ -129,9 +152,12 @@
        /**
         * Get the HTML source for the book, from cache or by rendering it.
         * @param string $key Cache key
-        * @return string[] associative array with:
+        * @return array Associative array with:
         *    - html: HTML, not including <body> or anything outside that.
         *    - key: WAN cache key for the HTML
+        *    - sections: sections in the book (see DataProvider::fetchMetadata 
for format)
+        *    - outline: outline of the book (see 
BookRenderer::renderCoverAndToc for format)
+        *    - displaytitle: array of dbkey => title HTML
         *    - modules, modulescripts, modulestyles, jsconfigvars: 
ResourceLoader data
         */
        private function getBook( $key = null ) {
@@ -155,6 +181,9 @@
 
                $book = [
                        'html' => $html,
+                       'sections' => $metadata['sections'],
+                       'outline' => $metadata['outline'],
+                       'displaytitle' => $metadata['displaytitle'],
                        'modules' => $metadata['modules'],
                        'modulescripts' => $metadata['modulescripts'],
                        'modulestyles' => $metadata['modulestyles'],
diff --git a/composer.json b/composer.json
index 18ebcc1..1490595 100644
--- a/composer.json
+++ b/composer.json
@@ -7,7 +7,9 @@
 
        "require": {
                "composer/installers" : ">=1.0.1",
-               "vivliostyle/viewer": "2017.6"
+               "vivliostyle/viewer": "2017.6",
+               "smalot/pdfparser": "0.10.0",
+               "zendframework/zendpdf": "~2.0"
        },
        "config": {
                "prepend-autoloader": false,
diff --git a/includes/BookRenderer.php b/includes/BookRenderer.php
index affb83b..8f867a4 100644
--- a/includes/BookRenderer.php
+++ b/includes/BookRenderer.php
@@ -18,9 +18,11 @@
         * @param array[] $collection Collection, as returned by 
CollectionSession::getCollection().
         * @param string[] $pages Map of prefixed DB key => Parsoid HTML.
         * @param array[] $metadata Map of prefixed DB key => metadata, as 
returned by fetchMetadata().
+        *   Section data will be updated to account for heading level and id 
changes.
+        *   Also, an outline will be added (see renderCoverAndToc() for 
format).
         * @return string HTML of the rendered book (without body/head).
         */
-       public function renderBook( $collection, $pages, $metadata ) {
+       public function renderBook( $collection, $pages, &$metadata ) {
                $hasChapters = (bool)array_filter( $collection['items'], 
function ( $item ) {
                        return $item['type'] === 'chapter';
                } );
@@ -81,11 +83,23 @@
         * Generate HTML for book cover page and table of contents.
         * @param array $collection Collection, as returned by 
CollectionSession::getCollection().
         * @param array[] $metadata Map of prefixed DB key => metadata, as 
returned by fetchMetadata().
+        *   An outline will be added which is similar to sections but flat and 
each item has the fields
+        *     - text: text of the outline item (article title, section title 
etc)
+        *     - type: 'chapter', 'article', 'section' or 'contributors'
+        *     - level: heading level or -2 for chapter, -1 for article
+        *     - anchor: id of the document node which the outline item refers 
to
         * @return string HTML to prepend to the book.
         */
-       private function renderCoverAndToc( $collection, $metadata ) {
+       private function renderCoverAndToc( $collection, &$metadata ) {
                $cover = '';
                $toc = '';
+
+               $hasChapters = (bool)array_filter( $collection['items'], 
function ( $item ) {
+                       return $item['type'] === 'chapter';
+               } );
+               $articleCount = count( array_filter( $collection['items'], 
function ( $item ) {
+                       return $item['type'] === 'article';
+               } ) );
 
                $title = $collection['title'];
                $subtitle = $collection['subtitle'];
@@ -103,16 +117,20 @@
                                $outline[] = [
                                        'text' => $item['title'],
                                        'type' => 'chapter',
+                                       'level' => -2,
                                        'anchor' => 'mw-book-chapter-' . 
Sanitizer::escapeId( $item['title'] ),
                                ];
                        } elseif( $item['type'] === 'article' ) {
                                $title = Title::newFromText( $item['title'] );
                                $dbkey = $title->getPrefixedDBkey();
-                               $outline[] = [
-                                       'text' => 
$metadata['displaytitle'][$dbkey],
-                                       'type' => 'article',
-                                       'anchor' => 'mw-book-article-' . $dbkey,
-                               ];
+                               if ( $articleCount > 1 ) {
+                                       $outline[] = [
+                                               'text' => 
$metadata['displaytitle'][$dbkey],
+                                               'type' => 'article',
+                                               'level' => -1,
+                                               'anchor' => 'mw-book-article-' 
. $dbkey,
+                                       ];
+                               }
                                foreach ( $metadata['sections'][$dbkey] as 
$section ) {
                                        $outline[] = [
                                                'text' => $section['title'],
@@ -125,11 +143,21 @@
                                throw new LogicException( 'Unknown collection 
item type: ' . $item['type'] );
                        }
                }
+
+               if ( $hasChapters ) {
+                       $contributorsLevel = -2;
+               } elseif ( $articleCount > 1 ) {
+                       $contributorsLevel = -1;
+               } else {
+                       $contributorsLevel = 0;
+               }
                $outline[] = [
                        'text' => 'Contributors',
                        'type' => 'contributors',
+                       'level' => $contributorsLevel,
                        'anchor' => 'mw-book-contributors',
                ];
+               $metadata['outline'] = $outline;
 
                $tocItems = array_map( function ( $item ) {
                        $class = [ 'mw-book-tocitem-type-' . $item['type'] ];
diff --git a/includes/PdfPostProcessor.php b/includes/PdfPostProcessor.php
new file mode 100644
index 0000000..e7a562f
--- /dev/null
+++ b/includes/PdfPostProcessor.php
@@ -0,0 +1,174 @@
+<?php
+
+namespace MediaWiki\Extensions\Collection;
+
+use ZendPdf\PdfDocument;
+
+/**
+ * Adds an outline and a table of contents with page numbers to a PDF file 
generated by Electron.
+ */
+class PdfPostProcessor {
+
+       /**
+        * Post-process PDF with page numbers and TOC numbers
+        * @param string $pdfContent Contents of the PDF file to post-process
+        * @param array[] $outline Sections in the book (see 
BookRenderer::renderCoverAndToc for format)
+        * @return string The new PDF file
+        */
+       public function postProcessPdf( $pdfContent, array $outline ) {
+               $this->findPageNumbers( $pdfContent, $outline );
+
+               $pdf = new \ZendPdf\PdfDocument( $pdfContent, null, false );
+               $this->addOutline( $pdf, $outline );
+               $this->addTocNumbers( $pdf, $outline );
+               $this->addPageNumbers( $pdf );
+               return $pdf->render();
+       }
+
+       /**
+        * Find page number for each outline item and add to $outline
+        * @param string $pdfContent Contents of the PDF file
+        * @param array[] $outline Outline that was passed to postProcessPdf()
+        */
+       private function findPageNumbers( $pdfContent, array &$outline ) {
+               // TODO use ZendPdf
+               $parser = new \Smalot\PdfParser\Parser();
+               $pdf = $parser->parseContent( $pdfContent );
+               $pages  = $pdf->getPages();
+
+               $lastObject = end( $pdf->getObjects() );
+               foreach ( $outline as $i => $item ) {
+                       $page = $lastObject->getHeader()->get( $item['anchor'] 
)->getContent()[0];
+                       $pageNumber = array_search( $page, $pages ) + 1; // 
1-based
+                       $outline[$i]['page'] = $pageNumber;
+               }
+       }
+
+       /**
+        * Add outline to a PDF document.
+        * @param PdfDocument $pdf
+        * @param array[] $outline Outline that was passed to postProcessPdf(), 
and
+        *   extended with page numbers by findPageNumbers()
+        */
+       private function addOutline( PdfDocument $pdf, array $outline ) {
+               $lastItems = []; // level => last item on that level
+               foreach ( $outline as $item ) {
+                       $level = $item['level'];
+                       $title = strip_tags( $item['text'] );
+
+                       // FIXME would be nicer with named destinations but 
neither of these work:
+                       // $target = \ZendPdf\Destination\Named::create( 
$item['anchor'] );
+                       // $target = new \ZendPdf\Destination\Named( new 
\ZendPdf\InternalType\NameObject( $item['anchor'] ) );
+                       $target = \ZendPdf\Destination\Fit::create( 
$item['page'] - 1 ); // 0-based
+                       $outlineElement = 
\ZendPdf\Outline\AbstractOutline::create( $title, $target );
+                       if ( in_array( $item['type'], [ 'chapter', 'article' ], 
true ) ) {
+                       $outlineElement->setIsOpen( true );
+                       }
+
+                       $lastItems = wfArrayFilterByKey( $lastItems, function ( 
$key ) use ( $level ) {
+                               return $key < $level;
+                       } );
+                       /** @var \ZendPdf\Outline\AbstractOutline 
$parentElement */
+                       $parentElement = end( $lastItems ) ?: null;
+                       $lastItems[$level] = $outlineElement;
+
+                       if ( $parentElement ) {
+                               $parentElement->childOutlines[] = 
$outlineElement;
+                       } else {
+                               $pdf->outlines[] = $outlineElement;
+                       }
+               }
+       }
+
+       /**
+        * Add page numbers to the table of content in a PDF document.
+        * @param PdfDocument $pdf
+        * @param array[] $outline Sections in the book (see 
DataProvider::fetchMetadata for format),
+        *   extended with page numbers by findPageNumbers()
+        */
+       private function addTocNumbers( PdfDocument $pdf, array $outline ) {
+               $tocIndex = [];
+               foreach ( $outline as $item ) {
+                       $tocIndex[$item['anchor']] = $item['page'];
+               }
+
+               $firstTocPage = 0;
+               $lastTocPage = $outline[0]['page'] - 1; // there is a page 
break before the first content page
+               foreach ( array_slice( $pdf->pages, $firstTocPage, $lastTocPage 
) as $page ) {
+                       /** @var \ZendPdf\Page $page */
+                       foreach ( $page->getPageDictionary()->Annots->items as 
$annot ) {
+                               /** @var \ZendPdf\InternalType\DictionaryObject 
$annot */
+
+                               // sanity checks
+                               if (
+                                       $annot->Subtype->value !== 'Link'
+                                       || $annot->Dest->getType() !== 
\ZendPdf\InternalType\AbstractTypeObject::TYPE_NAME
+                               ) {
+                                       continue;
+                               }
+                               $anchor = $annot->Dest->value;
+                               if ( !array_key_exists( $anchor, $tocIndex ) ) {
+                                       continue;
+                               }
+
+                               $pageNum = $tocIndex[$anchor];
+
+                               // get the bounding rectangle for the clickable 
area of the TOC item
+                               list( $left, $top, $right, $bottom ) = 
array_map( function ( $num ) {
+                                       /** @var 
\ZendPdf\InternalType\NumericObject $num */
+                                       return $num->value;
+                               }, iterator_to_array( $annot->Rect->items ) );
+                               // [left, top, right, bottom] is conventional 
but the PDF spec allows specifying
+                               // any two opposite corners in an [x1, y1, x2, 
y2] format so let's be cautious
+                               if ( $left > $right ) {
+                                       list( $left, $right ) = [ $right, $left 
];
+                               }
+                               if ( $top < $bottom ) { // 0 is the bottom edge
+                                       list( $top, $bottom ) = [ $bottom, $top 
];
+                               }
+
+                               $font = new 
\ZendPdf\Resource\Font\Simple\Standard\TimesRoman();
+                               $fontSize = 10;
+                               $tocItemFontSize = 16;  // TODO sizes actually 
depend on outline item type
+                               $page->setFont( $font, $fontSize );
+
+                               // figure out X & Y offset compared to the top 
right corner of the link
+                               // annotation bounding box which makes it look 
nice
+                               $width = $this->getStringWidth( $pageNum, 
$font, $fontSize );
+                               $maxWidth = $this->getStringWidth( '0000', 
$font, $fontSize );
+                               $xOffset = $maxWidth - $width;
+                               $yOffset = - $font->getAscent() / 
$font->getUnitsPerEm() * $tocItemFontSize;
+
+                               $page->drawText( $pageNum, $right + $xOffset, 
$top + $yOffset, 'UTF-8');
+                       }
+               }
+       }
+
+       /**
+        * Add page numbers to a PDF document.
+        * @param PdfDocument $pdf
+        */
+       private function addPageNumbers( PdfDocument $pdf ) {
+               $pageNum = 0;
+               foreach ( $pdf->pages as $page ) {
+                       ++$pageNum;
+                       /** @var \ZendPdf\Page $page */
+                       $page->setFont( new 
\ZendPdf\Resource\Font\Simple\Standard\TimesRoman(), 14 );
+                       // TODO base these on $page->getWidth/Height
+                       $page->drawText( $pageNum, $pageNum % 2 ? 40 : 540, 20, 
'UTF-8' );
+               }
+       }
+
+       /**
+        * Helper method to calculate the width of a string.
+        * @param string $str The string (must be ASCII).
+        * @param \ZendPdf\Resource\Font\AbstractFont $font
+        * @param int $fontSize Font size in points.
+        * @return float
+        */
+       private function getStringWidth( $str, $font, $fontSize ) {
+               $codePoints = array_map( 'ord', str_split( $str ) );
+               return array_sum( $font->widthsForGlyphs( 
$font->glyphNumbersForCharacters( $codePoints ) ) )
+                                / $font->getUnitsPerEm() * $fontSize;
+       }
+}

-- 
To view, visit https://gerrit.wikimedia.org/r/364137
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Idacd765465ef86029baee502795f91f91b650dd6
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/Collection
Gerrit-Branch: master
Gerrit-Owner: Gergő Tisza <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to