Smalyshev has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/294403

Change subject: Cleanup code that has been moved.
......................................................................

Cleanup code that has been moved.

Change-Id: Id284d253629be9639cd1476a7d87c8bef3345a86
Depends-On: Ie45de496ecc826211d98eea3a410c7639b4be0a4
Bug: T89733
---
M autoload.php
D includes/BuildDocument/FileDataBuilder.php
D includes/BuildDocument/PageDataBuilder.php
D includes/BuildDocument/PageTextBuilder.php
M includes/Maintenance/MappingConfigBuilder.php
M includes/Updater.php
6 files changed, 4 insertions(+), 425 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch refs/changes/03/294403/1

diff --git a/autoload.php b/autoload.php
index d8d3ad5..2488bc9 100644
--- a/autoload.php
+++ b/autoload.php
@@ -12,11 +12,8 @@
        'CirrusSearch\\Api\\SettingsDump' => __DIR__ . 
'/includes/Api/SettingsDump.php',
        'CirrusSearch\\Api\\SuggestIndex' => __DIR__ . 
'/includes/Api/SuggestIndex.php',
        'CirrusSearch\\BuildDocument\\Builder' => __DIR__ . 
'/includes/BuildDocument/Builder.php',
-       'CirrusSearch\\BuildDocument\\FileDataBuilder' => __DIR__ . 
'/includes/BuildDocument/FileDataBuilder.php',
        'CirrusSearch\\BuildDocument\\IncomingLinksScoringMethod' => __DIR__ . 
'/includes/BuildDocument/SuggestScoring.php',
        'CirrusSearch\\BuildDocument\\PQScore' => __DIR__ . 
'/includes/BuildDocument/SuggestScoring.php',
-       'CirrusSearch\\BuildDocument\\PageDataBuilder' => __DIR__ . 
'/includes/BuildDocument/PageDataBuilder.php',
-       'CirrusSearch\\BuildDocument\\PageTextBuilder' => __DIR__ . 
'/includes/BuildDocument/PageTextBuilder.php',
        'CirrusSearch\\BuildDocument\\ParseBuilder' => __DIR__ . 
'/includes/BuildDocument/Builder.php',
        'CirrusSearch\\BuildDocument\\QualityScore' => __DIR__ . 
'/includes/BuildDocument/SuggestScoring.php',
        'CirrusSearch\\BuildDocument\\RedirectsAndIncomingLinks' => __DIR__ . 
'/includes/BuildDocument/RedirectsAndIncomingLinks.php',
diff --git a/includes/BuildDocument/FileDataBuilder.php 
b/includes/BuildDocument/FileDataBuilder.php
deleted file mode 100644
index 5db95a3..0000000
--- a/includes/BuildDocument/FileDataBuilder.php
+++ /dev/null
@@ -1,52 +0,0 @@
-<?php
-
-namespace CirrusSearch\BuildDocument;
-
-use LocalFile;
-
-/**
- * Add file metadata-type stuff to a document
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write to the Free Software Foundation, Inc.,
- * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
- * http://www.gnu.org/copyleft/gpl.html
- */
-
-class FileDataBuilder extends Builder {
-       /**
-        * @var LocalFile
-        */
-       private $file;
-
-       /**
-        * @return \Elastica\Document
-        */
-       public function build() {
-               $this->file = wfLocalFile( $this->title );
-               if ( $this->file && $this->file->exists() ) {
-                       $this->fileText();
-               }
-
-               return $this->doc;
-       }
-
-       private function fileText() {
-               if ( $this->file->getHandler() ) {
-                       $fileText = $this->file->getHandler()->getEntireText( 
$this->file );
-                       if ( $fileText ) {
-                               $this->doc->set( 'file_text', $fileText );
-                       }
-               }
-       }
-}
diff --git a/includes/BuildDocument/PageDataBuilder.php 
b/includes/BuildDocument/PageDataBuilder.php
deleted file mode 100644
index 04a1e3d..0000000
--- a/includes/BuildDocument/PageDataBuilder.php
+++ /dev/null
@@ -1,160 +0,0 @@
-<?php
-
-namespace CirrusSearch\BuildDocument;
-
-use Category;
-use Sanitizer;
-use Title;
-use CirrusSearch\Util;
-
-/**
- * Add everything to a page that doesn't require page text.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write to the Free Software Foundation, Inc.,
- * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
- * http://www.gnu.org/copyleft/gpl.html
- */
-
-class PageDataBuilder extends ParseBuilder {
-       /**
-        * @return \Elastica\Document
-        */
-       public function build() {
-               switch ( $this->content->getModel() ) {
-                       case CONTENT_MODEL_CSS:
-                       case CONTENT_MODEL_JAVASCRIPT:
-                               // Don't use parser output here. It's useless 
and leads
-                               // to weird results. Instead, clear everything. 
See bug 61752.
-                               $this->doc->set( 'category', array() );
-                               $this->doc->set( 'external_link', array() );
-                               $this->doc->set( 'heading', array() );
-                               $this->doc->set( 'outgoing_link', array() );
-                               $this->doc->set( 'template', array() );
-                               break;
-                       default:
-                               $this->categories();
-                               $this->externalLinks();
-                               $this->headings();
-                               $this->outgoingLinks();
-                               $this->templates();
-                               $this->setWikibaseItemId();
-               }
-
-               // All content types have a language
-               $this->doc->set( 'language',
-                       $this->title->getPageLanguage()->getCode() );
-
-               return $this->doc;
-       }
-
-       private function categories() {
-               $categories = array();
-               foreach ( array_keys( $this->parserOutput->getCategories() ) as 
$key ) {
-                       $categories[] = Category::newFromName( $key 
)->getTitle()->getText();
-               }
-               $this->doc->set( 'category', $categories );
-       }
-
-       private function externalLinks() {
-               $this->doc->set( 'external_link',
-                       array_keys( $this->parserOutput->getExternalLinks() )
-               );
-       }
-
-       private function outgoingLinks() {
-               $outgoingLinks = array();
-               foreach ( $this->parserOutput->getLinks() as $linkedNamespace 
=> $namespaceLinks ) {
-                       foreach ( array_keys( $namespaceLinks ) as $linkedDbKey 
) {
-                               $outgoingLinks[] =
-                                       Title::makeTitle( $linkedNamespace, 
$linkedDbKey )->getPrefixedDBkey();
-                       }
-               }
-               $this->doc->set( 'outgoing_link', $outgoingLinks );
-       }
-
-       private function templates() {
-               $templates = array();
-               foreach ( $this->parserOutput->getTemplates() as $tNS => 
$templatesInNS ) {
-                       foreach ( array_keys( $templatesInNS ) as $tDbKey ) {
-                               $templateTitle = Title::makeTitleSafe( $tNS, 
$tDbKey );
-                               if ( $templateTitle && $templateTitle->exists() 
) {
-                                       $templates[] = 
$templateTitle->getPrefixedText();
-                               }
-                       }
-               }
-               $this->doc->set( 'template', $templates );
-       }
-
-       private function headings() {
-               $headings = array();
-               $ignoredHeadings = $this->getIgnoredHeadings();
-               foreach ( $this->parserOutput->getSections() as $heading ) {
-                       $heading = $heading[ 'line' ];
-                       // First strip out things that look like references.  
We can't use HTML filtering because
-                       // the references come back as <sup> tags without a 
class.  To keep from breaking stuff like
-                       //  ==Applicability of the strict mass–energy 
equivalence formula, ''E'' = ''mc''<sup>2</sup>==
-                       // we don't remove the whole <sup> tag.  We also don't 
want to strip the <sup> tag and remove
-                       // everything that looks like [2] because, I dunno, 
maybe there is a band named Word [2] Foo
-                       // or something.  Whatever.  So we only strip things 
that look like <sup> tags wrapping a
-                       // reference.  And since the data looks like:
-                       //      Reference in heading 
<sup>&#91;1&#93;</sup><sup>&#91;2&#93;</sup>
-                       // we can not really use HtmlFormatter as we have no 
suitable selector.
-
-                       // Some wikis wrap the brackets in a span:
-                       // 
http://en.wikipedia.org/wiki/MediaWiki:Cite_reference_link
-                       $heading = preg_replace( '/<\/?span>/', '', $heading );
-                       // Normalize [] so the following regexp would work.
-                       $heading = preg_replace( [ '/&#91;/', '/&#93;/' ], [ 
'[', ']' ], $heading );
-                       $heading = preg_replace( 
'/<sup>\s*\[\s*\d+\s*\]\s*<\/sup>/is', '', $heading );
-
-                       // Strip tags from the heading or else we'll display 
them (escaped) in search results
-                       $heading = trim( Sanitizer::stripAllTags( $heading ) );
-
-                       // Note that we don't take the level of the heading 
into account - all headings are equal.
-                       // Except the ones we ignore.
-                       if ( !in_array( $heading, $ignoredHeadings ) ) {
-                               $headings[] = $heading;
-                       }
-               }
-               $this->doc->set( 'heading', $headings );
-       }
-
-       /**
-        * @return string[]
-        */
-       private function getIgnoredHeadings() {
-               static $ignoredHeadings = null;
-               if ( $ignoredHeadings === null ) {
-                       $source = wfMessage( 'cirrussearch-ignored-headings' 
)->inContentLanguage();
-                       $ignoredHeadings = array();
-                       if( !$source->isDisabled() ) {
-                               $lines = Util::parseSettingsInMessage( 
$source->plain() );
-                               $ignoredHeadings = $lines;               // Now 
we just have headings!
-                       }
-               }
-               return $ignoredHeadings;
-       }
-
-       /**
-        * Add item id of a page's connected Wikibase item (if available)
-        *
-        * @todo move this to Wikibase
-        */
-       private function setWikibaseItemId() {
-               $wikibaseItem = $this->parserOutput->getProperty( 
'wikibase_item' );
-               if ( $wikibaseItem !== false ) {
-                       $this->doc->set( 'wikibase_item', $wikibaseItem );
-               }
-       }
-}
diff --git a/includes/BuildDocument/PageTextBuilder.php 
b/includes/BuildDocument/PageTextBuilder.php
deleted file mode 100644
index 424161c..0000000
--- a/includes/BuildDocument/PageTextBuilder.php
+++ /dev/null
@@ -1,175 +0,0 @@
-<?php
-
-namespace CirrusSearch\BuildDocument;
-
-use Content;
-use HtmlFormatter\HtmlFormatter;
-use MediaWiki\Logger\LoggerFactory;
-use ParserOutput;
-use Sanitizer;
-
-/**
- * Adds fields to the document that require article text.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write to the Free Software Foundation, Inc.,
- * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
- * http://www.gnu.org/copyleft/gpl.html
- */
-class PageTextBuilder extends ParseBuilder {
-       /**
-        * @var string[] selectors to elements that are excluded entirely from 
search
-        */
-       private $excludedElementSelectors = array(
-               'audio', 'video',       // "it looks like you don't have 
javascript enabled..." do not need to index
-               'sup.reference',        // The [1] for references
-               '.mw-cite-backlink',    // The ↑ next to references in the 
references section
-               'h1', 'h2', 'h3',       // Headings are already indexed in 
their own field.
-               'h5', 'h6', 'h4',
-               '.autocollapse',        // Collapsed fields are hidden by 
default so we don't want them showing up.
-       );
-       /**
-        * @var string[] selectors to elements that are considered auxiliary to 
article text for search
-        */
-       private $auxiliaryElementSelectors = array(
-               '.thumbcaption',        // Thumbnail captions aren't really 
part of the text proper
-               'table',                // Neither are tables
-               '.rellink',             // Common style for "See also:".
-               '.dablink',             // Common style for calling out helpful 
links at the top of the article.
-               '.searchaux',           // New class users can use to mark 
stuff as auxiliary to searches.
-       );
-
-       /**
-        * @param \Elastica\Document $doc
-        * @param Content $content
-        * @param ParserOutput $parserOutput
-        */
-       public function __construct( \Elastica\Document $doc, Content $content, 
ParserOutput $parserOutput ) {
-               parent::__construct( $doc, null, $content, $parserOutput );
-       }
-
-       /**
-        * @return \Elastica\Document
-        */
-       public function build() {
-               list( $text, $opening, $auxiliary ) = $this->buildTextToIndex();
-               $this->doc->set( 'text', $text );
-               $this->doc->set( 'opening_text', $opening );
-               $this->doc->set( 'auxiliary_text', $auxiliary );
-               $this->doc->set( 'text_bytes', $this->content->getSize() );
-               $this->doc->set( 'source_text', 
$this->content->getTextForSearchIndex() );
-
-               return $this->doc;
-       }
-
-       /**
-        * Fetch text to index. If $content is wikitext then render and strip 
things from it.
-        * Otherwise delegate to the $content itself.
-        *
-        * @return array Three tuple of (text, opening, auxiliary). Text is 
always string. Opening
-        *  is string or null. Auxiliary is string[].
-        */
-       private function buildTextToIndex() {
-               switch ( $this->content->getModel() ) {
-                       case CONTENT_MODEL_WIKITEXT:
-                               return $this->formatWikitext( 
$this->parserOutput );
-                       default:
-                               $text = $this->content->getTextForSearchIndex();
-                               return array( $text, null, array() );
-               }
-       }
-
-       /**
-        * Get text to index from a ParserOutput assuming the content was 
wikitext.
-        *
-        * @param ParserOutput $parserOutput The parsed wikitext's parser output
-        * @return array who's first entry is text and second is opening text, 
and third is an
-        *  array of auxiliary text
-        */
-       private function formatWikitext( ParserOutput $parserOutput ) {
-               global $wgCirrusSearchBoostOpening;
-
-               $parserOutput->setEditSectionTokens( false );
-               $parserOutput->setTOCEnabled( false );
-               $text = $parserOutput->getText();
-               $opening = null;
-
-               switch ( $wgCirrusSearchBoostOpening ) {
-               case 'first_heading':
-                       $opening = $this->extractHeadingBeforeFirstHeading( 
$text );
-                       break;
-               case 'none':
-                       break;
-               default:
-                       LoggerFactory::getInstance( 'CirrusSearch' )->warning(
-                               "Invalid value for 
\$wgCirrusSearchBoostOpening: {wgCirrusSearchBoostOpening}",
-                               array( 'wgCirrusSearchBoostOpening' =>  
$wgCirrusSearchBoostOpening )
-                       );
-               }
-
-               // Add extra spacing around break tags so text crammed together 
like<br>this doesn't make one word.
-               $text = str_replace( '<br', "\n<br", $text );
-
-               $formatter = new HtmlFormatter( $text );
-
-               // Strip elements from the page that we never want in the 
search text.
-               $formatter->remove( $this->excludedElementSelectors );
-               $formatter->filterContent();
-
-               // Strip elements from the page that are auxiliary text.  These 
will still be
-               // searched but matches will be ranked lower and non-auxiliary 
matches will be
-               // preferred in highlighting.
-               $formatter->remove( $this->auxiliaryElementSelectors );
-               $auxiliaryElements = $formatter->filterContent();
-               $allText = trim( Sanitizer::stripAllTags( $formatter->getText() 
) );
-               $auxiliary = array();
-               foreach ( $auxiliaryElements as $auxiliaryElement ) {
-                       $auxiliary[] = trim( Sanitizer::stripAllTags( 
$formatter->getText( $auxiliaryElement ) ) );
-               }
-
-               return array( $allText, $opening, $auxiliary );
-       }
-
-       /**
-        * @param string $text
-        * @return string|null
-        */
-       private function extractHeadingBeforeFirstHeading( $text ) {
-               $matches = array();
-               if ( !preg_match( '/<h[123456]>/', $text, $matches, 
PREG_OFFSET_CAPTURE ) ) {
-                       // There isn't a first heading so we interpret this as 
the article
-                       // being entirely without heading.
-                       return null;
-               }
-               $text = substr( $text, 0, $matches[ 0 ][ 1 ] );
-               if ( !$text ) {
-                       // There isn't any text before the first heading so we 
declare there isn't
-                       // a first heading.
-                       return null;
-               }
-
-               $formatter = new HtmlFormatter( $text );
-               $formatter->remove( $this->excludedElementSelectors );
-               $formatter->remove( $this->auxiliaryElementSelectors );
-               $formatter->filterContent();
-               $text = trim( Sanitizer::stripAllTags( $formatter->getText() ) 
);
-
-               if ( !$text ) {
-                       // There isn't any text after filtering before the 
first heading so we declare
-                       // that there isn't a first heading.
-                       return null;
-               }
-
-               return $text;
-       }
-}
diff --git a/includes/Maintenance/MappingConfigBuilder.php 
b/includes/Maintenance/MappingConfigBuilder.php
index 6fb5c23..c7795b4 100644
--- a/includes/Maintenance/MappingConfigBuilder.php
+++ b/includes/Maintenance/MappingConfigBuilder.php
@@ -296,9 +296,8 @@
         * @param array $extra Extra analyzers for this field beyond the basic 
text and plain.
         * @return TextIndexField definition of the field
         */
-       public function buildStringField( $fieldName, $options = null, $extra = 
[] ) {
-               $field =
-                       new TextIndexField( $fieldName, 
SearchIndexField::INDEX_TYPE_TEXT, $this->config,
+       protected function buildStringField( $fieldName, $options = null, 
$extra = [] ) {
+               $field = new TextIndexField( $fieldName, 
SearchIndexField::INDEX_TYPE_TEXT, $this->config,
                                $extra );
                $field->setTextOptions( $options );
                return $field;
@@ -309,7 +308,7 @@
         * @param string $name Field name
         * @return IntegerIndexField
         */
-       public function buildLongField( $name ) {
+       protected function buildLongField( $name ) {
                return new IntegerIndexField( $name, 
SearchIndexField::INDEX_TYPE_INTEGER, $this->config );
        }
 
@@ -318,7 +317,7 @@
         * @param string $name Field name
         * @return KeywordIndexField
         */
-       public function buildKeywordField( $name ) {
+       protected function buildKeywordField( $name ) {
                return new KeywordIndexField( $name, 
SearchIndexField::INDEX_TYPE_KEYWORD, $this->config );
        }
 }
diff --git a/includes/Updater.php b/includes/Updater.php
index 79bd28b..d8481ca 100644
--- a/includes/Updater.php
+++ b/includes/Updater.php
@@ -2,9 +2,6 @@
 
 namespace CirrusSearch;
 
-use CirrusSearch\BuildDocument\FileDataBuilder;
-use CirrusSearch\BuildDocument\PageDataBuilder;
-use CirrusSearch\BuildDocument\PageTextBuilder;
 use Hooks as MWHooks;
 use MediaWiki\Logger\LoggerFactory;
 use ParserCache;
@@ -358,33 +355,6 @@
                }
 
                return $script;
-       }
-
-       /**
-        * Fetch page's content and parser output, using the parser cache if we 
can
-        *
-        * @param WikiPage $page The wikipage to get output for
-        * @param int $forceParse Bypass ParserCache and force a fresh parse.
-        * @return array(Content,ParserOutput)
-        */
-       private function getContentAndParserOutput( $page, $forceParse ) {
-               $content = $page->getContent();
-               $parserOptions = $page->makeParserOptions( 'canonical' );
-
-               if ( !$forceParse ) {
-                       $parserOutput = ParserCache::singleton()->get( $page, 
$parserOptions );
-               }
-
-               if ( !isset( $parserOutput ) || !$parserOutput instanceof 
ParserOutput ) {
-                       // We specify the revision ID here. There might be a 
newer revision,
-                       // but we don't care because (a) we've already got a 
job somewhere
-                       // in the queue to index it, and (b) we want magic 
words like
-                       // {{REVISIONUSER}} to be accurate
-                       $revId = $page->getRevision()->getId();
-                       $parserOutput = $content->getParserOutput( 
$page->getTitle(), $revId );
-               }
-               /** @suppress PhanUndeclaredVariable */
-               return array( $content, $parserOutput );
        }
 
        /**

-- 
To view, visit https://gerrit.wikimedia.org/r/294403
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Id284d253629be9639cd1476a7d87c8bef3345a86
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/CirrusSearch
Gerrit-Branch: master
Gerrit-Owner: Smalyshev <smalys...@wikimedia.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to