Smalyshev has uploaded a new change for review. https://gerrit.wikimedia.org/r/294403
Change subject: Cleanup code that has been moved. ...................................................................... Cleanup code that has been moved. Change-Id: Id284d253629be9639cd1476a7d87c8bef3345a86 Depends-On: Ie45de496ecc826211d98eea3a410c7639b4be0a4 Bug: T89733 --- M autoload.php D includes/BuildDocument/FileDataBuilder.php D includes/BuildDocument/PageDataBuilder.php D includes/BuildDocument/PageTextBuilder.php M includes/Maintenance/MappingConfigBuilder.php M includes/Updater.php 6 files changed, 4 insertions(+), 425 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch refs/changes/03/294403/1 diff --git a/autoload.php b/autoload.php index d8d3ad5..2488bc9 100644 --- a/autoload.php +++ b/autoload.php @@ -12,11 +12,8 @@ 'CirrusSearch\\Api\\SettingsDump' => __DIR__ . '/includes/Api/SettingsDump.php', 'CirrusSearch\\Api\\SuggestIndex' => __DIR__ . '/includes/Api/SuggestIndex.php', 'CirrusSearch\\BuildDocument\\Builder' => __DIR__ . '/includes/BuildDocument/Builder.php', - 'CirrusSearch\\BuildDocument\\FileDataBuilder' => __DIR__ . '/includes/BuildDocument/FileDataBuilder.php', 'CirrusSearch\\BuildDocument\\IncomingLinksScoringMethod' => __DIR__ . '/includes/BuildDocument/SuggestScoring.php', 'CirrusSearch\\BuildDocument\\PQScore' => __DIR__ . '/includes/BuildDocument/SuggestScoring.php', - 'CirrusSearch\\BuildDocument\\PageDataBuilder' => __DIR__ . '/includes/BuildDocument/PageDataBuilder.php', - 'CirrusSearch\\BuildDocument\\PageTextBuilder' => __DIR__ . '/includes/BuildDocument/PageTextBuilder.php', 'CirrusSearch\\BuildDocument\\ParseBuilder' => __DIR__ . '/includes/BuildDocument/Builder.php', 'CirrusSearch\\BuildDocument\\QualityScore' => __DIR__ . '/includes/BuildDocument/SuggestScoring.php', 'CirrusSearch\\BuildDocument\\RedirectsAndIncomingLinks' => __DIR__ . 
'/includes/BuildDocument/RedirectsAndIncomingLinks.php', diff --git a/includes/BuildDocument/FileDataBuilder.php b/includes/BuildDocument/FileDataBuilder.php deleted file mode 100644 index 5db95a3..0000000 --- a/includes/BuildDocument/FileDataBuilder.php +++ /dev/null @@ -1,52 +0,0 @@ -<?php - -namespace CirrusSearch\BuildDocument; - -use LocalFile; - -/** - * Add file metadata-type stuff to a document - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License along - * with this program; if not, write to the Free Software Foundation, Inc., - * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
- * http://www.gnu.org/copyleft/gpl.html - */ - -class FileDataBuilder extends Builder { - /** - * @var LocalFile - */ - private $file; - - /** - * @return \Elastica\Document - */ - public function build() { - $this->file = wfLocalFile( $this->title ); - if ( $this->file && $this->file->exists() ) { - $this->fileText(); - } - - return $this->doc; - } - - private function fileText() { - if ( $this->file->getHandler() ) { - $fileText = $this->file->getHandler()->getEntireText( $this->file ); - if ( $fileText ) { - $this->doc->set( 'file_text', $fileText ); - } - } - } -} diff --git a/includes/BuildDocument/PageDataBuilder.php b/includes/BuildDocument/PageDataBuilder.php deleted file mode 100644 index 04a1e3d..0000000 --- a/includes/BuildDocument/PageDataBuilder.php +++ /dev/null @@ -1,160 +0,0 @@ -<?php - -namespace CirrusSearch\BuildDocument; - -use Category; -use Sanitizer; -use Title; -use CirrusSearch\Util; - -/** - * Add everything to a page that doesn't require page text. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License along - * with this program; if not, write to the Free Software Foundation, Inc., - * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
- * http://www.gnu.org/copyleft/gpl.html - */ - -class PageDataBuilder extends ParseBuilder { - /** - * @return \Elastica\Document - */ - public function build() { - switch ( $this->content->getModel() ) { - case CONTENT_MODEL_CSS: - case CONTENT_MODEL_JAVASCRIPT: - // Don't use parser output here. It's useless and leads - // to weird results. Instead, clear everything. See bug 61752. - $this->doc->set( 'category', array() ); - $this->doc->set( 'external_link', array() ); - $this->doc->set( 'heading', array() ); - $this->doc->set( 'outgoing_link', array() ); - $this->doc->set( 'template', array() ); - break; - default: - $this->categories(); - $this->externalLinks(); - $this->headings(); - $this->outgoingLinks(); - $this->templates(); - $this->setWikibaseItemId(); - } - - // All content types have a language - $this->doc->set( 'language', - $this->title->getPageLanguage()->getCode() ); - - return $this->doc; - } - - private function categories() { - $categories = array(); - foreach ( array_keys( $this->parserOutput->getCategories() ) as $key ) { - $categories[] = Category::newFromName( $key )->getTitle()->getText(); - } - $this->doc->set( 'category', $categories ); - } - - private function externalLinks() { - $this->doc->set( 'external_link', - array_keys( $this->parserOutput->getExternalLinks() ) - ); - } - - private function outgoingLinks() { - $outgoingLinks = array(); - foreach ( $this->parserOutput->getLinks() as $linkedNamespace => $namespaceLinks ) { - foreach ( array_keys( $namespaceLinks ) as $linkedDbKey ) { - $outgoingLinks[] = - Title::makeTitle( $linkedNamespace, $linkedDbKey )->getPrefixedDBkey(); - } - } - $this->doc->set( 'outgoing_link', $outgoingLinks ); - } - - private function templates() { - $templates = array(); - foreach ( $this->parserOutput->getTemplates() as $tNS => $templatesInNS ) { - foreach ( array_keys( $templatesInNS ) as $tDbKey ) { - $templateTitle = Title::makeTitleSafe( $tNS, $tDbKey ); - if ( $templateTitle && 
$templateTitle->exists() ) { - $templates[] = $templateTitle->getPrefixedText(); - } - } - } - $this->doc->set( 'template', $templates ); - } - - private function headings() { - $headings = array(); - $ignoredHeadings = $this->getIgnoredHeadings(); - foreach ( $this->parserOutput->getSections() as $heading ) { - $heading = $heading[ 'line' ]; - // First strip out things that look like references. We can't use HTML filtering because - // the references come back as <sup> tags without a class. To keep from breaking stuff like - // ==Applicability of the strict mass–energy equivalence formula, ''E'' = ''mc''<sup>2</sup>== - // we don't remove the whole <sup> tag. We also don't want to strip the <sup> tag and remove - // everything that looks like [2] because, I dunno, maybe there is a band named Word [2] Foo - // or something. Whatever. So we only strip things that look like <sup> tags wrapping a - // reference. And since the data looks like: - // Reference in heading <sup>[1]</sup><sup>[2]</sup> - // we can not really use HtmlFormatter as we have no suitable selector. - - // Some wikis wrap the brackets in a span: - // http://en.wikipedia.org/wiki/MediaWiki:Cite_reference_link - $heading = preg_replace( '/<\/?span>/', '', $heading ); - // Normalize [] so the following regexp would work. - $heading = preg_replace( [ '/&#91;/', '/&#93;/' ], [ '[', ']' ], $heading ); - $heading = preg_replace( '/<sup>\s*\[\s*\d+\s*\]\s*<\/sup>/is', '', $heading ); - - // Strip tags from the heading or else we'll display them (escaped) in search results - $heading = trim( Sanitizer::stripAllTags( $heading ) ); - - // Note that we don't take the level of the heading into account - all headings are equal. - // Except the ones we ignore. 
- if ( !in_array( $heading, $ignoredHeadings ) ) { - $headings[] = $heading; - } - } - $this->doc->set( 'heading', $headings ); - } - - /** - * @return string[] - */ - private function getIgnoredHeadings() { - static $ignoredHeadings = null; - if ( $ignoredHeadings === null ) { - $source = wfMessage( 'cirrussearch-ignored-headings' )->inContentLanguage(); - $ignoredHeadings = array(); - if( !$source->isDisabled() ) { - $lines = Util::parseSettingsInMessage( $source->plain() ); - $ignoredHeadings = $lines; // Now we just have headings! - } - } - return $ignoredHeadings; - } - - /** - * Add item id of a page's connected Wikibase item (if available) - * - * @todo move this to Wikibase - */ - private function setWikibaseItemId() { - $wikibaseItem = $this->parserOutput->getProperty( 'wikibase_item' ); - if ( $wikibaseItem !== false ) { - $this->doc->set( 'wikibase_item', $wikibaseItem ); - } - } -} diff --git a/includes/BuildDocument/PageTextBuilder.php b/includes/BuildDocument/PageTextBuilder.php deleted file mode 100644 index 424161c..0000000 --- a/includes/BuildDocument/PageTextBuilder.php +++ /dev/null @@ -1,175 +0,0 @@ -<?php - -namespace CirrusSearch\BuildDocument; - -use Content; -use HtmlFormatter\HtmlFormatter; -use MediaWiki\Logger\LoggerFactory; -use ParserOutput; -use Sanitizer; - -/** - * Adds fields to the document that require article text. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License along - * with this program; if not, write to the Free Software Foundation, Inc., - * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. - * http://www.gnu.org/copyleft/gpl.html - */ -class PageTextBuilder extends ParseBuilder { - /** - * @var string[] selectors to elements that are excluded entirely from search - */ - private $excludedElementSelectors = array( - 'audio', 'video', // "it looks like you don't have javascript enabled..." do not need to index - 'sup.reference', // The [1] for references - '.mw-cite-backlink', // The ↑ next to references in the references section - 'h1', 'h2', 'h3', // Headings are already indexed in their own field. - 'h5', 'h6', 'h4', - '.autocollapse', // Collapsed fields are hidden by default so we don't want them showing up. - ); - /** - * @var string[] selectors to elements that are considered auxiliary to article text for search - */ - private $auxiliaryElementSelectors = array( - '.thumbcaption', // Thumbnail captions aren't really part of the text proper - 'table', // Neither are tables - '.rellink', // Common style for "See also:". - '.dablink', // Common style for calling out helpful links at the top of the article. - '.searchaux', // New class users can use to mark stuff as auxiliary to searches. 
- ); - - /** - * @param \Elastica\Document $doc - * @param Content $content - * @param ParserOutput $parserOutput - */ - public function __construct( \Elastica\Document $doc, Content $content, ParserOutput $parserOutput ) { - parent::__construct( $doc, null, $content, $parserOutput ); - } - - /** - * @return \Elastica\Document - */ - public function build() { - list( $text, $opening, $auxiliary ) = $this->buildTextToIndex(); - $this->doc->set( 'text', $text ); - $this->doc->set( 'opening_text', $opening ); - $this->doc->set( 'auxiliary_text', $auxiliary ); - $this->doc->set( 'text_bytes', $this->content->getSize() ); - $this->doc->set( 'source_text', $this->content->getTextForSearchIndex() ); - - return $this->doc; - } - - /** - * Fetch text to index. If $content is wikitext then render and strip things from it. - * Otherwise delegate to the $content itself. - * - * @return array Three tuple of (text, opening, auxiliary). Text is always string. Opening - * is string or null. Auxiliary is string[]. - */ - private function buildTextToIndex() { - switch ( $this->content->getModel() ) { - case CONTENT_MODEL_WIKITEXT: - return $this->formatWikitext( $this->parserOutput ); - default: - $text = $this->content->getTextForSearchIndex(); - return array( $text, null, array() ); - } - } - - /** - * Get text to index from a ParserOutput assuming the content was wikitext. 
- * - * @param ParserOutput $parserOutput The parsed wikitext's parser output - * @return array who's first entry is text and second is opening text, and third is an - * array of auxiliary text - */ - private function formatWikitext( ParserOutput $parserOutput ) { - global $wgCirrusSearchBoostOpening; - - $parserOutput->setEditSectionTokens( false ); - $parserOutput->setTOCEnabled( false ); - $text = $parserOutput->getText(); - $opening = null; - - switch ( $wgCirrusSearchBoostOpening ) { - case 'first_heading': - $opening = $this->extractHeadingBeforeFirstHeading( $text ); - break; - case 'none': - break; - default: - LoggerFactory::getInstance( 'CirrusSearch' )->warning( - "Invalid value for \$wgCirrusSearchBoostOpening: {wgCirrusSearchBoostOpening}", - array( 'wgCirrusSearchBoostOpening' => $wgCirrusSearchBoostOpening ) - ); - } - - // Add extra spacing around break tags so text crammed together like<br>this doesn't make one word. - $text = str_replace( '<br', "\n<br", $text ); - - $formatter = new HtmlFormatter( $text ); - - // Strip elements from the page that we never want in the search text. - $formatter->remove( $this->excludedElementSelectors ); - $formatter->filterContent(); - - // Strip elements from the page that are auxiliary text. These will still be - // searched but matches will be ranked lower and non-auxiliary matches will be - // preferred in highlighting. 
- $formatter->remove( $this->auxiliaryElementSelectors ); - $auxiliaryElements = $formatter->filterContent(); - $allText = trim( Sanitizer::stripAllTags( $formatter->getText() ) ); - $auxiliary = array(); - foreach ( $auxiliaryElements as $auxiliaryElement ) { - $auxiliary[] = trim( Sanitizer::stripAllTags( $formatter->getText( $auxiliaryElement ) ) ); - } - - return array( $allText, $opening, $auxiliary ); - } - - /** - * @param string $text - * @return string|null - */ - private function extractHeadingBeforeFirstHeading( $text ) { - $matches = array(); - if ( !preg_match( '/<h[123456]>/', $text, $matches, PREG_OFFSET_CAPTURE ) ) { - // There isn't a first heading so we interpret this as the article - // being entirely without heading. - return null; - } - $text = substr( $text, 0, $matches[ 0 ][ 1 ] ); - if ( !$text ) { - // There isn't any text before the first heading so we declare there isn't - // a first heading. - return null; - } - - $formatter = new HtmlFormatter( $text ); - $formatter->remove( $this->excludedElementSelectors ); - $formatter->remove( $this->auxiliaryElementSelectors ); - $formatter->filterContent(); - $text = trim( Sanitizer::stripAllTags( $formatter->getText() ) ); - - if ( !$text ) { - // There isn't any text after filtering before the first heading so we declare - // that there isn't a first heading. - return null; - } - - return $text; - } -} diff --git a/includes/Maintenance/MappingConfigBuilder.php b/includes/Maintenance/MappingConfigBuilder.php index 6fb5c23..c7795b4 100644 --- a/includes/Maintenance/MappingConfigBuilder.php +++ b/includes/Maintenance/MappingConfigBuilder.php @@ -296,9 +296,8 @@ * @param array $extra Extra analyzers for this field beyond the basic text and plain. 
* @return TextIndexField definition of the field */ - public function buildStringField( $fieldName, $options = null, $extra = [] ) { - $field = - new TextIndexField( $fieldName, SearchIndexField::INDEX_TYPE_TEXT, $this->config, + protected function buildStringField( $fieldName, $options = null, $extra = [] ) { + $field = new TextIndexField( $fieldName, SearchIndexField::INDEX_TYPE_TEXT, $this->config, $extra ); $field->setTextOptions( $options ); return $field; @@ -309,7 +308,7 @@ * @param string $name Field name * @return IntegerIndexField */ - public function buildLongField( $name ) { + protected function buildLongField( $name ) { return new IntegerIndexField( $name, SearchIndexField::INDEX_TYPE_INTEGER, $this->config ); } @@ -318,7 +317,7 @@ * @param string $name Field name * @return KeywordIndexField */ - public function buildKeywordField( $name ) { + protected function buildKeywordField( $name ) { return new KeywordIndexField( $name, SearchIndexField::INDEX_TYPE_KEYWORD, $this->config ); } } diff --git a/includes/Updater.php b/includes/Updater.php index 79bd28b..d8481ca 100644 --- a/includes/Updater.php +++ b/includes/Updater.php @@ -2,9 +2,6 @@ namespace CirrusSearch; -use CirrusSearch\BuildDocument\FileDataBuilder; -use CirrusSearch\BuildDocument\PageDataBuilder; -use CirrusSearch\BuildDocument\PageTextBuilder; use Hooks as MWHooks; use MediaWiki\Logger\LoggerFactory; use ParserCache; @@ -358,33 +355,6 @@ } return $script; - } - - /** - * Fetch page's content and parser output, using the parser cache if we can - * - * @param WikiPage $page The wikipage to get output for - * @param int $forceParse Bypass ParserCache and force a fresh parse. 
- * @return array(Content,ParserOutput) - */ - private function getContentAndParserOutput( $page, $forceParse ) { - $content = $page->getContent(); - $parserOptions = $page->makeParserOptions( 'canonical' ); - - if ( !$forceParse ) { - $parserOutput = ParserCache::singleton()->get( $page, $parserOptions ); - } - - if ( !isset( $parserOutput ) || !$parserOutput instanceof ParserOutput ) { - // We specify the revision ID here. There might be a newer revision, - // but we don't care because (a) we've already got a job somewhere - // in the queue to index it, and (b) we want magic words like - // {{REVISIONUSER}} to be accurate - $revId = $page->getRevision()->getId(); - $parserOutput = $content->getParserOutput( $page->getTitle(), $revId ); - } - /** @suppress PhanUndeclaredVariable */ - return array( $content, $parserOutput ); } /** -- To view, visit https://gerrit.wikimedia.org/r/294403 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: Id284d253629be9639cd1476a7d87c8bef3345a86 Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/extensions/CirrusSearch Gerrit-Branch: master Gerrit-Owner: Smalyshev <smalys...@wikimedia.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits