Phoenix303 has uploaded a new change for review. https://gerrit.wikimedia.org/r/222074
Change subject: Cross language search using MessageCollection ...................................................................... Cross language search using MessageCollection Steps: 1. Use a filtered query to search for a string in a source language. For example, if the source language is Finnish, a search for 'Aloita' returns all Finnish messages containing 'Aloita'. 2. Create message definitions for all message keys found in step 1. 3. Filter to get all the translated messages in the selected target language. By default, the target language is the same as the source language. 4. Use makeFacets() to build language and group facets. This patch uses MessageCollection; an alternative way to achieve the same result without MessageCollection is described in https://gerrit.wikimedia.org/r/#/c/218859/ Change-Id: I0cd190a87c19318a79a124318183ca4d4d8a07b7 --- M specials/SpecialSearchTranslations.php M ttmserver/ElasticSearchTTMServer.php 2 files changed, 122 insertions(+), 48 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/Translate refs/changes/74/222074/1 diff --git a/specials/SpecialSearchTranslations.php b/specials/SpecialSearchTranslations.php index 37b2344..1ef394c 100644 --- a/specials/SpecialSearchTranslations.php +++ b/specials/SpecialSearchTranslations.php @@ -80,6 +80,11 @@ return; } + if ( $opts->getValue( 'language' ) === '' ) { + $language = $this->getLanguage()->getCode(); + $opts->add( 'language', $language ); + } + try { $resultset = $server->search( $queryString, $opts, $this->hl ); } catch ( TTMServerException $e ) { @@ -87,7 +92,36 @@ throw new ErrorPageError( 'tux-sst-solr-offline-title', 'tux-sst-solr-offline-body' ); } + $messages = $documents = $terms = array(); + $language = $opts->getValue( 'language' ); + foreach ( $resultset->getResults() as $document ) { + $data = $document->getData(); + $localid = explode( ':', $data['localid'] ); + $namespace = strtoupper( "NS_" . 
$localid[0] ); + $key = implode( ':', array( constant( $namespace ), $localid[1] ) ); + $messages[$key] = $data['content']; + $terms[] = $data['localid']; + } + + + $definitions = new MessageDefinitions( $messages ); + $collection = MessageCollection::newFromDefinitions( $definitions, $language ); + $collection->filter( 'translated', false ); + + $off = $collection->slice( $opts->getValue('offset'), $this->limit ); + $collection->loadTranslations(); + + foreach ( $collection->keys() as $mkey => $title ) { + $documents[$mkey]['title'] = $title; + $documents[$mkey]['definition'] = $messages[$mkey]; + $documents[$mkey]['translation'] = $collection[$mkey]->translation(); + } + // Part 1: facets + if ( method_exists( $server, 'makeFacets' ) ) { + $resultset = $server->makeFacets( $terms, $opts ); + } + $facets = $server->getFacets( $resultset ); $total = $server->getTotalHits( $resultset ); $facetHtml = ''; @@ -111,39 +145,37 @@ // Part 2: results $resultsHtml = ''; - $documents = $server->getDocuments( $resultset ); - foreach ( $documents as $document ) { - $text = $document['content']; + foreach ( $documents as $mkey => $values ) { + $text = $documents[$mkey]['translation']; $text = TranslateUtils::convertWhiteSpaceToHTML( $text ); list( $pre, $post ) = $this->hl; $text = str_replace( $pre, '<strong class="tux-highlight">', $text ); $text = str_replace( $post, '</strong>', $text ); - $title = Title::newFromText( $document['localid'] . '/' . $document['language'] ); - if ( !$title ) { + if ( !$documents[$mkey]['title'] ) { // Should not ever happen but who knows... 
continue; } $resultAttribs = array( 'class' => 'row tux-message', - 'data-title' => $title->getPrefixedText(), - 'data-language' => $document['language'], + 'data-title' => $documents[$mkey]['title']->getPrefixedText(), + 'data-language' => $language, ); - $handle = new MessageHandle( $title ); + $handle = new MessageHandle( $documents[$mkey]['title'] ); $edit = ''; if ( $handle->isValid() ) { $groupId = $handle->getGroup()->getId(); - $helpers = new TranslationHelpers( $title, $groupId ); + $helpers = new TranslationHelpers( $documents[$mkey]['title'], $groupId ); $resultAttribs['data-definition'] = $helpers->getDefinition(); $resultAttribs['data-translation'] = $helpers->getTranslation(); $resultAttribs['data-group'] = $groupId; - $uri = wfAppendQuery( $document['uri'], array( 'action' => 'edit' ) ); + $uri = wfAppendQuery( $handle->getTitle()->getCanonicalUrl(), array( 'action' => 'edit' ) ); $link = Html::element( 'a', array( 'href' => $uri, ), $this->msg( 'tux-sst-edit' )->text() ); @@ -154,7 +186,7 @@ ); } - $titleText = $title->getPrefixedText(); + $titleText = $documents[$mkey]['title']->getPrefixedText(); $titleAttribs = array( 'class' => 'row tux-title', 'dir' => 'ltr', @@ -162,8 +194,8 @@ $textAttribs = array( 'class' => 'row tux-text', - 'lang' => wfBCP47( $document['language'] ), - 'dir' => Language::factory( $document['language'] )->getDir(), + 'lang' => wfBCP47( $language ), + 'dir' => Language::factory( $language )->getDir(), ); $resultsHtml = $resultsHtml diff --git a/ttmserver/ElasticSearchTTMServer.php b/ttmserver/ElasticSearchTTMServer.php index c0dbb55..5b047ed 100644 --- a/ttmserver/ElasticSearchTTMServer.php +++ b/ttmserver/ElasticSearchTTMServer.php @@ -465,42 +465,94 @@ // Allow searching either by message content or message id (page name // without language subpage) with exact match only. 
- $serchQuery = new \Elastica\Query\Bool(); + $searchQuery = new \Elastica\Query\Bool(); $contentQuery = new \Elastica\Query\Match(); $contentQuery->setFieldQuery( 'content', $queryString ); - $serchQuery->addShould( $contentQuery ); + $searchQuery->addShould( $contentQuery ); $messageQuery = new \Elastica\Query\Term(); $messageQuery->setTerm( 'localid', $queryString ); - $serchQuery->addShould( $messageQuery ); - $query->setQuery( $serchQuery ); + $searchQuery->addShould( $messageQuery ); + $filteredQuery = new \Elastica\Query\Filtered(); + $filterbool = new \Elastica\Filter\Bool(); + + $context = RequestContext::getMain(); + $languageCode = $context->getLanguage()->getCode(); + + $languageFilter = new \Elastica\Filter\Term(); + $languageFilter->setTerm( 'language', $languageCode ); + $filterbool->addMust( $languageFilter ); + + $group = $opts->getValue( 'group' ); + if ( $group !== '' ) { + $groupFilter = new \Elastica\Filter\Term(); + $groupFilter->setTerm( 'group', $group ); + $filterbool->addMust( $groupFilter ); + } + + $filteredQuery->setFilter($filterbool); + $filteredQuery->setQuery($searchQuery); + + $query->setQuery( $filteredQuery ); + $query->setParam( '_source', array( 'localid', 'group', 'content' ) ); + + list( $pre, $post ) = $highlight; + $query->setHighlight( array( + // The value must be an object + 'fields' => array( + $contentString => array( + 'number_of_fragments' => 0, + ), + ), + 'pre_tags' => array( $pre ), + 'post_tags' => array( $post ), + ) ); + + $query->setFrom( 0 ); + $query->setSize( 500 ); + do { + try { + $resultset = $this->getType()->getIndex()->search( $query ); + } catch ( \Elastica\Exception\ExceptionInterface $e ) { + throw new TTMServerException( $e->getMessage() ); + } + $size = $query->getParam( 'size' ); + $query->setSize( $resultset->getTotalHits() ); + + } while ( $resultset->getTotalHits() > $size ); + return $resultset; + } + + // Fetch data for facets counts + public function makeFacets( $terms, $opts ) { + + 
$filteredQuery = new \Elastica\Query\Filtered(); + $idQuery = new \Elastica\Filter\Terms(); + $idQuery->setTerms( 'localid', $terms ); + + $filteredQuery->setFilter($idQuery); + $query = new \Elastica\Query(); + + $query->setQuery( $filteredQuery ); + + // Language facet to retrieve count for each language $language = new \Elastica\Facet\Terms( 'language' ); $language->setField( 'language' ); - $language->setSize( 500 ); + $language->setSize( 600 ); $query->addFacet( $language ); + // Group facet to retrieve count for each group $group = new \Elastica\Facet\Terms( 'group' ); $group->setField( 'group' ); - // Would like to prioritize the top level groups and not show subgroups - // if the top group has only few hits, but that doesn't seem to be possile. $group->setSize( 500 ); $query->addFacet( $group ); - $query->setSize( $opts->getValue( 'limit' ) ); - $query->setFrom( $opts->getValue( 'offset' ) ); - - // BoolAnd filters are executed in sequence per document. Bool filters with - // multiple must clauses are executed by converting each filter into a bit - // field then anding them together. The latter is normally faster if either - // of the subfilters are reused. May not make a difference in this context. $filters = new \Elastica\Filter\Bool(); $language = $opts->getValue( 'language' ); - if ( $language !== '' ) { - $languageFilter = new \Elastica\Filter\Term(); - $languageFilter->setTerm( 'language', $language ); - $filters->addMust( $languageFilter ); - } + $languageFilter = new \Elastica\Filter\Term(); + $languageFilter->setTerm( 'language', $language ); + $filters->addMust( $languageFilter ); $group = $opts->getValue( 'group' ); if ( $group !== '' ) { @@ -508,23 +560,13 @@ $groupFilter->setTerm( 'group', $group ); $filters->addMust( $groupFilter ); } + $query->setFilter( $filters ); - // Check that we have at least one filter to avoid invalid query errors. 
- if ( $language !== '' || $group !== '' ) { - $query->setFilter( $filters ); - } - - list( $pre, $post ) = $highlight; - $query->setHighlight( array( - // The value must be an object - 'fields' => array( - 'content' => array( - 'number_of_fragments' => 0, - ), - ), - 'pre_tags' => array( $pre ), - 'post_tags' => array( $post ), - ) ); + $offset = $opts->getValue( 'offset' ); + $limit = $opts->getValue( 'limit' ); + $query->setFrom( $offset ); + $query->setSize( $limit ); + $query->setParam( '_source', array( 'content', 'localid', 'language', 'group', 'wiki' ) ); try { return $this->getType()->getIndex()->search( $query ); -- To view, visit https://gerrit.wikimedia.org/r/222074 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I0cd190a87c19318a79a124318183ca4d4d8a07b7 Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/extensions/Translate Gerrit-Branch: master Gerrit-Owner: Phoenix303 <divyalife...@gmail.com> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits