Manybubbles has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/160811

Change subject: Adds a cirrus-analyze api
......................................................................

Adds a cirrus-analyze api

The analyze API wraps Elasticsearch's indices analyze endpoint
(http://www.elasticsearch.org/guide/en/elasticsearch/reference/current/indices-analyze.html)
and lets users fiddle a bit with the chosen analyzer.  It might be useful
for explanation purposes, but unfortunately the analyze API isn't powerful
enough for any real experimentation.

Change-Id: I461c50581417bd4bb125dc3940d08d2c425f3151
---
M CirrusSearch.php
A includes/Action/AbstractFormlessAction.php
A includes/Action/Analyze.php
R includes/Action/Dump.php
A includes/Api/Analyze.php
M includes/Maintenance/AnalysisConfigBuilder.php
M includes/Searcher.php
7 files changed, 250 insertions(+), 27 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch 
refs/changes/11/160811/1

diff --git a/CirrusSearch.php b/CirrusSearch.php
index 08a63fb..86c98e7 100644
--- a/CirrusSearch.php
+++ b/CirrusSearch.php
@@ -500,6 +500,7 @@
 
 
 $includes = __DIR__ . "/includes/";
+$actionDir = $includes . 'Action/';
 $apiDir = $includes . 'Api/';
 $buildDocument = $includes . 'BuildDocument/';
 $jobsDir = $includes . 'Job/';
@@ -511,6 +512,10 @@
  * Classes
  */
 $wgAutoloadClasses['CirrusSearch'] = $includes . 'CirrusSearch.php';
+$wgAutoloadClasses['CirrusSearch\Action\AbstractFormlessAction'] = $actionDir 
. 'AbstractFormlessAction.php';
+$wgAutoloadClasses['CirrusSearch\Action\Analyze'] = $actionDir . 'Analyze.php';
+$wgAutoloadClasses['CirrusSearch\Action\Dump'] = $actionDir . 'Dump.php';
+$wgAutoloadClasses['CirrusSearch\Api\Analyze'] = $apiDir . 'Analyze.php';
 $wgAutoloadClasses['CirrusSearch\Api\ConfigDump'] = $apiDir . 'ConfigDump.php';
 $wgAutoloadClasses['CirrusSearch\Api\MappingDump'] = $apiDir . 
'MappingDump.php';
 $wgAutoloadClasses['CirrusSearch\Api\SettingsDump'] = $apiDir . 
'SettingsDump.php';
@@ -521,7 +526,6 @@
 $wgAutoloadClasses['CirrusSearch\BuildDocument\ParseBuilder'] = $buildDocument 
. 'Builder.php';
 $wgAutoloadClasses['CirrusSearch\BuildDocument\RedirectsAndIncomingLinks'] = 
$buildDocument . 'RedirectsAndIncomingLinks.php';
 $wgAutoloadClasses['CirrusSearch\Connection'] = $includes . 'Connection.php';
-$wgAutoloadClasses['CirrusSearch\Dump'] = $includes . 'Dump.php';
 $wgAutoloadClasses['CirrusSearch\ElasticsearchIntermediary'] = $includes . 
'ElasticsearchIntermediary.php';
 $wgAutoloadClasses['CirrusSearch\ForceSearchIndex'] = __DIR__ . 
'/maintenance/forceSearchIndex.php';
 $wgAutoloadClasses['CirrusSearch\Hooks'] = $includes . 'Hooks.php';
@@ -601,11 +605,13 @@
 /**
  * Actions
  */
-$wgActions[ 'cirrusdump' ] = 'CirrusSearch\Dump';
+$wgActions[ 'cirrusdump' ] = 'CirrusSearch\Action\Dump';
+$wgActions[ 'cirrusanalyze' ] = 'CirrusSearch\Action\Analyze';
 
 /**
  * API
  */
+$wgAPIModules['cirrus-analyze'] = 'CirrusSearch\Api\Analyze';
 $wgAPIModules['cirrus-config-dump'] = 'CirrusSearch\Api\ConfigDump';
 $wgAPIModules['cirrus-mapping-dump'] = 'CirrusSearch\Api\MappingDump';
 $wgAPIModules['cirrus-settings-dump'] = 'CirrusSearch\Api\SettingsDump';
diff --git a/includes/Action/AbstractFormlessAction.php 
b/includes/Action/AbstractFormlessAction.php
new file mode 100644
index 0000000..4926aa5
--- /dev/null
+++ b/includes/Action/AbstractFormlessAction.php
@@ -0,0 +1,48 @@
+<?php
+
+namespace CirrusSearch\Action;
+
+use \CirrusSearch\Searcher;
+use \FormlessAction;
+
+/**
+ * Base class for Cirrus formless actions that output result() as JSON.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ * http://www.gnu.org/copyleft/gpl.html
+ */
+abstract class AbstractFormlessAction extends FormlessAction {
+       public abstract function result();
+
+       public function onView() {
+               // Disable regular results
+               $this->getOutput()->disable();
+
+               $response = $this->getRequest()->response();
+               $response->header( 'Content-type: application/json; 
charset=UTF-8' );
+
+               echo json_encode( $this->result() );
+
+               return null;
+       }
+
+       public function requiresWrite() {
+               return false;
+       }
+
+       public function requiresUnblock() {
+               return false;
+       }
+}
diff --git a/includes/Action/Analyze.php b/includes/Action/Analyze.php
new file mode 100644
index 0000000..5ed42a8
--- /dev/null
+++ b/includes/Action/Analyze.php
@@ -0,0 +1,55 @@
+<?php
+
+namespace CirrusSearch\Action;
+
+use \CirrusSearch\Searcher;
+use \FormlessAction;
+
+/**
+ * action=cirrusanalyze handler.  Analyzes the page text into tokens.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ * http://www.gnu.org/copyleft/gpl.html
+ */
+class Analyze extends AbstractFormlessAction {
+       public function result() {
+               $searcher = new Searcher( 0, 0, false, $this->getUser() );
+               $id = $this->getTitle()->getArticleID();
+               $esSources = $searcher->get( array( $id ), true );
+               if ( !$esSources->isOk() ) {
+                       // Exception has been logged
+                       return array();
+               }
+               $esSources = $esSources->getValue();
+               if ( !isset( $esSources[ 0 ] ) ) {
+                       return array();
+               }
+               $source = $esSources[ 0 ];
+               if ( !isset( $source->text ) ) {
+                       return array();
+               }
+               $text = $source->text;
+               $analyzed = $searcher->analyze( $text, array( 'analyzer' => 
'text' ) );
+               if ( !$analyzed->isOk() ) {
+                       // Exception has been logged
+                       return array();                 
+               }
+               return $analyzed->getValue();
+       }
+
+       public function getName() {
+               return 'cirrusanalyze';
+       }
+}
diff --git a/includes/Dump.php b/includes/Action/Dump.php
similarity index 73%
rename from includes/Dump.php
rename to includes/Action/Dump.php
index 58d5b65..16df6d2 100644
--- a/includes/Dump.php
+++ b/includes/Action/Dump.php
@@ -1,11 +1,12 @@
 <?php
 
-namespace CirrusSearch;
+namespace CirrusSearch\Action;
 
+use \CirrusSearch\Searcher;
 use \FormlessAction;
 
 /**
- * action=cirrusDump handler.  Dumps contents of Elasticsearch indexes for the
+ * action=cirrusdump handler.  Dumps contents of Elasticsearch indexes for the
  * page.
  *
  * This program is free software; you can redistribute it and/or modify
@@ -23,21 +24,14 @@
  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  * http://www.gnu.org/copyleft/gpl.html
  */
-class Dump extends FormlessAction {
-       public function onView() {
-               // Disable regular results
-               $this->getOutput()->disable();
-
-               $response = $this->getRequest()->response();
-               $response->header( 'Content-type: application/json; 
charset=UTF-8' );
-
+class Dump extends AbstractFormlessAction {
+       public function result() {
                $searcher = new Searcher( 0, 0, false, $this->getUser() );
                $id = $this->getTitle()->getArticleID();
                $esSources = $searcher->get( array( $id ), true );
                if ( !$esSources->isOk() ) {
                        // Exception has been logged
-                       echo '{}';
-                       return null;
+                       return array();
                }
                $esSources = $esSources->getValue();
 
@@ -51,20 +45,10 @@
                                '_source' => $esSource->getData(),
                        );
                }
-               echo json_encode( $result );
-
-               return null;
+               return $result;
        }
 
        public function getName() {
                return 'cirrusdump';
-       }
-
-       public function requiresWrite() {
-               return false;
-       }
-
-       public function requiresUnblock() {
-               return false;
        }
 }
diff --git a/includes/Api/Analyze.php b/includes/Api/Analyze.php
new file mode 100644
index 0000000..d806fb1
--- /dev/null
+++ b/includes/Api/Analyze.php
@@ -0,0 +1,97 @@
+<?php
+
+
+namespace CirrusSearch\Api;
+use \ApiBase;
+use \CirrusSearch\Searcher;
+
+/**
+ * Analyzes a string using some parameters.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ * http://www.gnu.org/copyleft/gpl.html
+ */
+class Analyze extends ApiBase {
+       public function execute() {
+               $params = $this->extractRequestParams();
+               $searcher = new Searcher( 0, 0, false, $this->getUser() );
+               $args = array();
+               if ( isset( $params[ 'analyzer' ] ) ) {
+                       $args[ 'analyzer' ] = $params[ 'analyzer' ];
+               } else {
+                       $args[ 'tokenizer' ] = $params[ 'tokenizer' ];
+                       if ( isset( $params[ 'tokenfilters' ] ) ) {
+                               $args[ 'token_filters' ] = implode( ',', 
$params[ 'tokenfilters' ] );
+                       }
+                       if ( isset( $params[ 'charfilters' ] ) ) {
+                               $args[ 'char_filters' ] = implode( ',', 
$params[ 'charfilters' ] );
+                       }
+               }
+               
+               $analyzed = $searcher->analyze( $params[ 'text' ], $args );
+               if ( !$analyzed->isOk() ) {
+                       // Exception has been logged
+                       $this->dieStatus( $analyzed );
+               }
+               $result = array();
+               foreach ( $analyzed->getValue() as $value ) {
+                       $result[] = $value;
+               }
+               $this->getResult()->setIndexedTagName( $result, 'tokens' );
+               $this->getResult()->addValue( null, 'tokens', $result );
+       }
+
+       public function getAllowedParams() {
+               return array(
+                       'text' => array(
+                               ApiBase::PARAM_TYPE => 'string',
+                               ApiBase::PARAM_REQUIRED => true
+                       ),
+                       'analyzer' => array(
+                               ApiBase::PARAM_TYPE => array(
+                                       'text', 'plain', 'near_match',
+                                       'suggest', 'prefix', 'word_prefix',
+                                       'lowercase_keyword'
+                               ),
+                       ),
+                       'tokenizer' => array(
+                               ApiBase::PARAM_TYPE => 'string',
+                               ApiBase::PARAM_DFLT => 'standard',
+                       ),
+                       'tokenfilters' => array(
+                               ApiBase::PARAM_TYPE => 'string',
+                               ApiBase::PARAM_ISMULTI => true,
+                       ),
+                       'charfilters' => array(
+                               ApiBase::PARAM_TYPE => 'string',
+                               ApiBase::PARAM_ISMULTI => true,
+                       ),
+               );
+       }
+
+       public function getParamDescription() {
+               return array(
+                       'text' => 'Text to analyze',
+                       'analyzer' => 'Named analyzer (overrides tokenizer, 
tokenfilters, charfilters if set)',
+                       'tokenizer' => 'Tokenizer doing analysis',
+                       'tokenfilters' => 'Filters applied to tokens',
+                       'charfilters' => 'Filters applied to text before 
tokenizer',
+               );
+       }
+
+       public function getDescription() {
+               return 'Analyze a string using Elasticsearch.';
+       }
+}
diff --git a/includes/Maintenance/AnalysisConfigBuilder.php 
b/includes/Maintenance/AnalysisConfigBuilder.php
index 1d17c4f..a36fd37 100644
--- a/includes/Maintenance/AnalysisConfigBuilder.php
+++ b/includes/Maintenance/AnalysisConfigBuilder.php
@@ -162,7 +162,7 @@
                                ),
                        ),
                        'char_filter' => array(
-                               // Flattens things that are space like to 
spaces in the near_match style analyzersc
+                               // Flattens things that are space like to 
spaces in the near_match style analyzers
                                'near_space_flattener' => array(
                                        'type' => 'mapping',
                                        'mappings' => array(
diff --git a/includes/Searcher.php b/includes/Searcher.php
index bd78012..fdc636b 100644
--- a/includes/Searcher.php
+++ b/includes/Searcher.php
@@ -781,7 +781,7 @@
                        'doWork' => function() use ( $searcher, $pageIds, 
$sourceFiltering, $indexType, $indexBaseName ) {
                                try {
                                        global 
$wgCirrusSearchClientSideSearchTimeout;
-                                       $searcher->start( "get of $indexType." 
. implode( ', ', $pageIds ) );
+                                       $searcher->start( "get of " . implode( 
', ', $pageIds ) );
                                        // Shard timeout not supported on get 
requests so we just use the client side timeout
                                        Connection::setTimeout( 
$wgCirrusSearchClientSideSearchTimeout[ 'default' ] );
                                        $pageType = Connection::getPageType( 
$indexBaseName, $indexType );
@@ -806,6 +806,39 @@
                return $getWork->execute();
        }
 
+       /**
+        * Send $text to elasticsearch for analysis with the analysis arguments 
from $args.
+        * @param string $text text to analyze
+        * @param array $args analysis arguments
+        * @return Status result of the analysis; check isOk() before getValue()
+        */
+       public function analyze( $text, $args ) {
+               global $wgCirrusSearchPoolCounterKey;
+
+               $searcher = $this;
+               $getWork = new PoolCounterWorkViaCallback( 
'CirrusSearch-Analyze', $wgCirrusSearchPoolCounterKey, array(
+                       'doWork' => function() use ( $searcher, $text, $args ) {
+                               global $wgCirrusSearchClientSideSearchTimeout;
+
+                               try {
+                                       $searcher->start( "analyzing" );
+                                       // No shard timeout is set for analyze 
requests so we just use the client side timeout
+                                       Connection::setTimeout( 
$wgCirrusSearchClientSideSearchTimeout[ 'default' ] );
+                                       $index = Connection::getIndex( 
wfWikiId(), 'general' );
+                                       return $searcher->success( 
$index->analyze( $text, $args ) );
+                               } catch ( 
\Elastica\Exception\ExceptionInterface $e ) {
+                                       return $searcher->failure( $e );
+                               }
+                       },
+                       'error' => function( $status ) {
+                               $status = $status->getErrorsArray();
+                               wfLogWarning( 'Pool error performing an analyze 
against Elasticsearch:  ' . $status[ 0 ][ 0 ] );
+                               return Status::newFatal( 
'cirrussearch-backend-error' );
+                       }
+               ) );
+               return $getWork->execute();
+       }
+
        private function extractSpecialSyntaxFromTerm( $regex, $callback ) {
                $suggestPrefixes = $this->suggestPrefixes;
                $this->term = preg_replace_callback( $regex,

-- 
To view, visit https://gerrit.wikimedia.org/r/160811
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I461c50581417bd4bb125dc3940d08d2c425f3151
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/CirrusSearch
Gerrit-Branch: master
Gerrit-Owner: Manybubbles <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to