jenkins-bot has submitted this change and it was merged.

Change subject: Add detection limits for textcat
......................................................................


Add detection limits for textcat

Bug: T127338
Change-Id: I8d1c54d1b2e5f4ee6aebbff83664e3d07f12f0f0
---
M includes/CirrusSearch.php
M includes/LanguageDetector/TextCat.php
M tests/unit/LanguageDetectTest.php
3 files changed, 35 insertions(+), 14 deletions(-)

Approvals:
  Cindy-the-browser-test-bot: Looks good to me, but someone else must approve
  Tjones: Looks good to me, but someone else must approve
  DCausse: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/includes/CirrusSearch.php b/includes/CirrusSearch.php
index e1cf1ed..c628e46 100644
--- a/includes/CirrusSearch.php
+++ b/includes/CirrusSearch.php
@@ -97,6 +97,14 @@
        }
 
        /**
+        * Get search config
+        * @return SearchConfig
+        */
+       public function getConfig() {
+               return $this->config;
+       }
+
+       /**
         * Override supports to shut off updates to Cirrus via the SearchEngine infrastructure.  Page
         * updates and additions are chained on the end of the links update job.  Deletes are noticed
         * via the ArticleDeleteComplete hook.
diff --git a/includes/LanguageDetector/TextCat.php b/includes/LanguageDetector/TextCat.php
index ae84a8a..2772382 100644
--- a/includes/LanguageDetector/TextCat.php
+++ b/includes/LanguageDetector/TextCat.php
@@ -12,18 +12,24 @@
         * @see \CirrusSearch\LanguageDetector\Detector::detect()
         */
        public function detect( CirrusSearch $cirrus, $text ) {
-               global $wgCirrusSearchTextcatModel;
-               if( empty( $wgCirrusSearchTextcatModel ) ) {
+               $config = $cirrus->getConfig();
+               if( empty( $config ) ) {
+                       // Should not happen
                        return null;
                }
-               if( !is_dir( $wgCirrusSearchTextcatModel ) ) {
+               $dir = $config->getElement('CirrusSearchTextcatModel');
+               if( !$dir ) {
+                       return null;
+               }
+               if( !is_dir( $dir ) ) {
                        LoggerFactory::getInstance( 'CirrusSearch' )->warning(
                                "Bad directory for TextCat model: {dir}",
-                               array( "dir" => $wgCirrusSearchTextcatModel )
+                               array( "dir" => $dir )
                        );
                }
-               $textcat = new \TextCat( $wgCirrusSearchTextcatModel );
-               $languages = $textcat->classify( $text );
+
+               $textcat = new \TextCat( $dir );
+               $languages = $textcat->classify( $text, $config->getElement( 'CirrusSearchTextcatLanguages' ) );
                if( !empty( $languages ) ) {
                        // For now, just return the best option
                        // TODO: thing what else we could do
diff --git a/tests/unit/LanguageDetectTest.php b/tests/unit/LanguageDetectTest.php
index dd4707f..d317e20 100644
--- a/tests/unit/LanguageDetectTest.php
+++ b/tests/unit/LanguageDetectTest.php
@@ -41,6 +41,11 @@
 
        public function setUp() {
                $this->cirrus = new \CirrusSearch();
+               global $wgCirrusSearchTextcatModel;
+               if (empty( $wgCirrusSearchTextcatModel ) ) {
+                       $tc = new \ReflectionClass('TextCat');
+                       $wgCirrusSearchTextcatModel = dirname($tc->getFileName())."/LM-query/";
+               }
        }
 
        /**
@@ -49,19 +54,21 @@
         * @param string $language
         */
        public function testTextCatDetector($text, $language) {
-               global $wgCirrusSearchTextcatModel;
-               if (empty($wgCirrusSearchTextcatModel)) {
-                       $tc = new \ReflectionClass( 'TextCat' );
-                       $wgCirrusSearchTextcatModel = dirname( $tc->getFileName() )."/LM-query/";
-               }
                // not really used for anything, but we need to pass it as a parameter
                $detector = new TextCat();
                $detect = $detector->detect($this->cirrus, $text);
                $this->assertEquals($language, $detect);
        }
 
-       public function getHttpLangs()
-       {
+       public function testTextCatDetectorLimited() {
+               global $wgCirrusSearchTextcatLanguages;
+               $wgCirrusSearchTextcatLanguages = array("en", "ru");
+               $detector = new TextCat();
+               $detect = $detector->detect($this->cirrus, "volviendose malo");
+               $this->assertEquals("en", $detect);
+       }
+
+       public function getHttpLangs() {
                return array(
                        array("en", array("en"), null),
                        array("en", array("en-UK", "en-US"), null),
@@ -76,7 +83,7 @@
        /**
         * @dataProvider getHttpLangs
         * @param string $content
-        * @param array $http
+        * @param array  $http
         * @param string $result
         */
        public function testHttpAcceptDetector($content, $http, $result) {

-- 
To view, visit https://gerrit.wikimedia.org/r/271719
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I8d1c54d1b2e5f4ee6aebbff83664e3d07f12f0f0
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/CirrusSearch
Gerrit-Branch: master
Gerrit-Owner: Smalyshev <[email protected]>
Gerrit-Reviewer: Cindy-the-browser-test-bot <[email protected]>
Gerrit-Reviewer: DCausse <[email protected]>
Gerrit-Reviewer: EBernhardson <[email protected]>
Gerrit-Reviewer: Manybubbles <[email protected]>
Gerrit-Reviewer: Tjones <[email protected]>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to