EBernhardson has uploaded a new change for review. (
https://gerrit.wikimedia.org/r/392471 )
Change subject: [WIP] Add word count statistic for articles
......................................................................
[WIP] Add word count statistic for articles
The community survey asked for this feature, and it was pretty
straightforward to add to CirrusSearch.
Change-Id: I847f696405b447ab04972ad0215c09d0012c2098
---
M CirrusSearch.php
M autoload.php
M includes/CirrusSearch.php
M includes/Hooks.php
A includes/Query/CountContentWordsBuilder.php
M includes/Search/ResultsType.php
M includes/Search/SearchContext.php
M includes/Search/SearchRequestBuilder.php
M includes/Searcher.php
9 files changed, 131 insertions(+), 4 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch
refs/changes/71/392471/1
diff --git a/CirrusSearch.php b/CirrusSearch.php
index b7a8682..094d656 100644
--- a/CirrusSearch.php
+++ b/CirrusSearch.php
@@ -1302,6 +1302,7 @@
$wgHooks[ 'SoftwareInfo' ][] = 'CirrusSearch\Hooks::onSoftwareInfo';
$wgHooks[ 'SpecialSearchResults' ][] =
'CirrusSearch\Hooks::onSpecialSearchResults';
$wgHooks[ 'SpecialSearchResultsAppend' ][] =
'CirrusSearch\Hooks::onSpecialSearchResultsAppend';
+$wgHooks[ 'SpecialStatsAddExtra'][] =
'CirrusSearch\Hooks::onSpecialStatsAddExtra';
$wgHooks[ 'TitleMove' ][] = 'CirrusSearch\Hooks::onTitleMove';
$wgHooks[ 'TitleMoveComplete' ][] = 'CirrusSearch\Hooks::onTitleMoveComplete';
$wgHooks[ 'UnitTestsList' ][] = 'CirrusSearch\Hooks::onUnitTestsList';
diff --git a/autoload.php b/autoload.php
index 094cff9..de0770f 100644
--- a/autoload.php
+++ b/autoload.php
@@ -115,6 +115,7 @@
'CirrusSearch\\Query\\BoostTemplatesFeature' => __DIR__ .
'/includes/Query/BoostTemplatesFeature.php',
'CirrusSearch\\Query\\CompSuggestQueryBuilder' => __DIR__ .
'/includes/Query/CompSuggestQueryBuilder.php',
'CirrusSearch\\Query\\ContentModelFeature' => __DIR__ .
'/includes/Query/ContentModelFeature.php',
+ 'CirrusSearch\\Query\\CountContentWordsBuilder' => __DIR__ .
'/includes/Query/CountContentWordsBuilder.php',
'CirrusSearch\\Query\\FileNumericFeature' => __DIR__ .
'/includes/Query/FileNumericFeature.php',
'CirrusSearch\\Query\\FileTypeFeature' => __DIR__ .
'/includes/Query/FileTypeFeature.php',
'CirrusSearch\\Query\\FullTextQueryBuilder' => __DIR__ .
'/includes/Query/FullTextQueryBuilder.php',
@@ -195,6 +196,7 @@
'CirrusSearch\\Search\\SearchMetricsProvider' => __DIR__ .
'/includes/Search/SearchMetricsProvider.php',
'CirrusSearch\\Search\\SearchRequestBuilder' => __DIR__ .
'/includes/Search/SearchRequestBuilder.php',
'CirrusSearch\\Search\\ShortTextIndexField' => __DIR__ .
'/includes/Search/ShortTextIndexField.php',
+ 'CirrusSearch\\Search\\SingleAggResultsType' => __DIR__ .
'/includes/Search/ResultsType.php',
'CirrusSearch\\Search\\SourceTextIndexField' => __DIR__ .
'/includes/Search/SourceTextIndexField.php',
'CirrusSearch\\Search\\StaticCrossProjectBlockScorer' => __DIR__ .
'/includes/Search/CrossProjectBlockScorer.php',
'CirrusSearch\\Search\\TeamDraftInterleaver' => __DIR__ .
'/includes/Search/TeamDraftInterleaver.php',
diff --git a/includes/CirrusSearch.php b/includes/CirrusSearch.php
index 089589a..52b8e3f 100644
--- a/includes/CirrusSearch.php
+++ b/includes/CirrusSearch.php
@@ -798,10 +798,7 @@
return Status::newGood( [] );
}
- $searcher = new Searcher( $this->connection, $this->offset,
$this->limit, $this->config, $this->namespaces,
- null, $this->indexBaseName );
- $searcher->setOptionsFromRequest( $this->request );
-
+ $searcher = $this->makeSearcher();
$status = $searcher->searchArchive( $term );
if ( $status->isOK() && $searcher->isReturnRaw() ) {
$status->setResult( true,
@@ -810,4 +807,22 @@
return $status;
}
+ public function countContentWords() {
+ $this->limit = 1;
+ $searcher = $this->makeSearcher();
+ $status = $searcher->countContentWords();
+
+ if ( $status->isOK() && $searcher->isReturnRaw() ) {
+ $status->setResult( true,
+ $searcher->processRawReturn(
$status->getValue(), $this->request, $this->dumpAndDie ) );
+ }
+ return $status;
+ }
+
+ private function makeSearcher() {
+ $searcher = new Searcher( $this->connection, $this->offset,
$this->limit, $this->config, $this->namespaces,
+ null, $this->indexBaseName );
+ $searcher->setOptionsFromRequest( $this->request );
+ return $searcher;
+ }
}
diff --git a/includes/Hooks.php b/includes/Hooks.php
index 44cbc9d..8b783f8 100644
--- a/includes/Hooks.php
+++ b/includes/Hooks.php
@@ -853,4 +853,12 @@
return true;
}
+ public static function onSpecialStatsAddExtra( &$extraStats, $context )
{
+ $search = new CirrusSearch();
+
+ $status = $search->countContentWords();
+ if ( $status->isOK() ) {
+ $extraStats['cirrussearch-article-words'] =
$status->getValue();
+ }
+ }
}
diff --git a/includes/Query/CountContentWordsBuilder.php
b/includes/Query/CountContentWordsBuilder.php
new file mode 100644
index 0000000..30fff10
--- /dev/null
+++ b/includes/Query/CountContentWordsBuilder.php
@@ -0,0 +1,26 @@
+<?php
+
+namespace CirrusSearch\Query;
+
+use Elastica\Aggregation\Sum;
+use Elastica\Query;
+use CirrusSearch\Connection;
+use CirrusSearch\Search\SingleAggResultsType;
+use CirrusSearch\Search\SearchContext;
+
+/**
+ * Configure a search context to sum the text.word_count field via an aggregation.
+ */
+class CountContentWordsBuilder {
+
+ /**
+ * @param SearchContext $context the search context
+ */
+ public function build( SearchContext $context ) {
+ $context->setSearchType( 'sum_word_count' );
+ $context->setResultsType( new SingleAggResultsType(
'word_count' ) );
+ $context->setRescoreProfile( 'empty' );
+ $context->addAggregation(
+ ( new Sum( 'word_count' ) )->setField(
'text.word_count' ) );
+ }
+}
diff --git a/includes/Search/ResultsType.php b/includes/Search/ResultsType.php
index 98e3f19..a990a4b 100644
--- a/includes/Search/ResultsType.php
+++ b/includes/Search/ResultsType.php
@@ -570,3 +570,48 @@
return [];
}
}
+
+class SingleAggResultsType implements ResultsType {
+ /** @var string Name of aggregation */
+ private $name;
+
+ /** @param string $name Name of aggregation to return */
+ public function __construct( $name ) {
+ $this->name = $name;
+ }
+
+ /**
+ * @return false|string|array corresponding to Elasticsearch source
filtering syntax
+ */
+ public function getSourceFiltering() {
+ return false;
+ }
+
+ public function getStoredFields() {
+ return [];
+ }
+
+ public function getHighlightingConfiguration( array $highlightSource ) {
+ return null;
+ }
+
+ /**
+ * @param SearchContext $context
+ * @param \Elastica\ResultSet $resultSet
+ * @return mixed 'value' of the named aggregation, or an empty array if it is absent
+ */
+ public function transformElasticsearchResult( SearchContext $context,
\Elastica\ResultSet $resultSet ) {
+ $aggs = $resultSet->getAggregations();
+ if ( isset( $aggs[$this->name] ) ) {
+ return $aggs[$this->name]['value'];
+ }
+ return [];
+ }
+
+ /**
+ * @return string[]
+ */
+ public function createEmptyResult() {
+ return [];
+ }
+}
diff --git a/includes/Search/SearchContext.php
b/includes/Search/SearchContext.php
index 59e244a..9b1b596 100644
--- a/includes/Search/SearchContext.php
+++ b/includes/Search/SearchContext.php
@@ -4,6 +4,7 @@
use CirrusSearch\OtherIndexes;
use CirrusSearch\SearchConfig;
+use Elastica\Aggregation\AbstractAggregation;
use Elastica\Query\AbstractQuery;
/**
@@ -203,6 +204,11 @@
* @var ResultsType Type of the result for the context.
*/
private $resultsType;
+
+ /**
+ * @var AbstractAggregation[] Aggregations to perform
+ */
+ private $aggs = [];
/**
* @param SearchConfig $config
@@ -873,4 +879,13 @@
$this->phraseRescoreQuery = $phraseRescoreQuery;
$this->isDirty = true;
}
+
+ public function addAggregation( AbstractAggregation $agg ) {
+ $this->aggs[] = $agg;
+ $this->isDirty = true;
+ }
+
+ public function getAggregations() {
+ return $this->aggs;
+ }
}
diff --git a/includes/Search/SearchRequestBuilder.php
b/includes/Search/SearchRequestBuilder.php
index 8e69378..aaadc6a 100644
--- a/includes/Search/SearchRequestBuilder.php
+++ b/includes/Search/SearchRequestBuilder.php
@@ -69,6 +69,10 @@
$query->setQuery( $this->searchContext->getQuery() );
+ foreach ( $this->searchContext->getAggregations() as $agg ) {
+ $query->addAggregation( $agg );
+ }
+
$highlight = $this->searchContext->getHighlight( $resultsType );
if ( $highlight ) {
$query->setHighlight( $highlight );
diff --git a/includes/Searcher.php b/includes/Searcher.php
index 20a28e7..3c1752c 100644
--- a/includes/Searcher.php
+++ b/includes/Searcher.php
@@ -2,6 +2,7 @@
namespace CirrusSearch;
+use CirrusSearch\Query\CountContentWordsBuilder;
use CirrusSearch\Query\NearMatchQueryBuilder;
use CirrusSearch\Query\PrefixSearchQueryBuilder;
use CirrusSearch\Query\SimpleKeywordFeature;
@@ -220,6 +221,16 @@
}
/**
+ * Perform a sum over the number of words in the content index
+ * @return Status status containing a single integer
+ */
+ public function countContentWords() {
+ ( new CountContentWordsBuilder() )->build( $this->searchContext
);
+ $this->limit = 1;
+ return $this->searchOne();
+ }
+
+ /**
* Perform a prefix search.
* @param string $term text by which to search
* @param string[] $variants variants to search for
--
To view, visit https://gerrit.wikimedia.org/r/392471
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I847f696405b447ab04972ad0215c09d0012c2098
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/CirrusSearch
Gerrit-Branch: master
Gerrit-Owner: EBernhardson <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits