jenkins-bot has submitted this change and it was merged.
Change subject: Add detection limits for textcat
......................................................................
Add detection limits for textcat
Bug: T127338
Change-Id: I8d1c54d1b2e5f4ee6aebbff83664e3d07f12f0f0
---
M includes/CirrusSearch.php
M includes/LanguageDetector/TextCat.php
M tests/unit/LanguageDetectTest.php
3 files changed, 35 insertions(+), 14 deletions(-)
Approvals:
Cindy-the-browser-test-bot: Looks good to me, but someone else must approve
Tjones: Looks good to me, but someone else must approve
DCausse: Looks good to me, approved
jenkins-bot: Verified
diff --git a/includes/CirrusSearch.php b/includes/CirrusSearch.php
index e1cf1ed..c628e46 100644
--- a/includes/CirrusSearch.php
+++ b/includes/CirrusSearch.php
@@ -97,6 +97,14 @@
}
/**
+ * Get search config
+ * @return SearchConfig
+ */
+ public function getConfig() {
+ return $this->config;
+ }
+
+ /**
* Override supports to shut off updates to Cirrus via the SearchEngine infrastructure. Page
* updates and additions are chained on the end of the links update job. Deletes are noticed
* via the ArticleDeleteComplete hook.
diff --git a/includes/LanguageDetector/TextCat.php b/includes/LanguageDetector/TextCat.php
index ae84a8a..2772382 100644
--- a/includes/LanguageDetector/TextCat.php
+++ b/includes/LanguageDetector/TextCat.php
@@ -12,18 +12,24 @@
* @see \CirrusSearch\LanguageDetector\Detector::detect()
*/
public function detect( CirrusSearch $cirrus, $text ) {
- global $wgCirrusSearchTextcatModel;
- if( empty( $wgCirrusSearchTextcatModel ) ) {
+ $config = $cirrus->getConfig();
+ if( empty( $config ) ) {
+ // Should not happen
return null;
}
- if( !is_dir( $wgCirrusSearchTextcatModel ) ) {
+ $dir = $config->getElement('CirrusSearchTextcatModel');
+ if( !$dir ) {
+ return null;
+ }
+ if( !is_dir( $dir ) ) {
LoggerFactory::getInstance( 'CirrusSearch' )->warning(
"Bad directory for TextCat model: {dir}",
- array( "dir" => $wgCirrusSearchTextcatModel )
+ array( "dir" => $dir )
);
}
- $textcat = new \TextCat( $wgCirrusSearchTextcatModel );
- $languages = $textcat->classify( $text );
+
+ $textcat = new \TextCat( $dir );
+ $languages = $textcat->classify( $text, $config->getElement( 'CirrusSearchTextcatLanguages' ) );
if( !empty( $languages ) ) {
// For now, just return the best option
// TODO: think what else we could do
diff --git a/tests/unit/LanguageDetectTest.php b/tests/unit/LanguageDetectTest.php
index dd4707f..d317e20 100644
--- a/tests/unit/LanguageDetectTest.php
+++ b/tests/unit/LanguageDetectTest.php
@@ -41,6 +41,11 @@
public function setUp() {
$this->cirrus = new \CirrusSearch();
+ global $wgCirrusSearchTextcatModel;
+ if (empty( $wgCirrusSearchTextcatModel ) ) {
+ $tc = new \ReflectionClass('TextCat');
+ $wgCirrusSearchTextcatModel = dirname($tc->getFileName())."/LM-query/";
+ }
}
/**
@@ -49,19 +54,21 @@
* @param string $language
*/
public function testTextCatDetector($text, $language) {
- global $wgCirrusSearchTextcatModel;
- if (empty($wgCirrusSearchTextcatModel)) {
- $tc = new \ReflectionClass( 'TextCat' );
- $wgCirrusSearchTextcatModel = dirname( $tc->getFileName() )."/LM-query/";
- }
// not really used for anything, but we need to pass it as a parameter
$detector = new TextCat();
$detect = $detector->detect($this->cirrus, $text);
$this->assertEquals($language, $detect);
}
- public function getHttpLangs()
- {
+ public function testTextCatDetectorLimited() {
+ global $wgCirrusSearchTextcatLanguages;
+ $wgCirrusSearchTextcatLanguages = array("en", "ru");
+ $detector = new TextCat();
+ $detect = $detector->detect($this->cirrus, "volviendose malo");
+ $this->assertEquals("en", $detect);
+ }
+
+ public function getHttpLangs() {
return array(
array("en", array("en"), null),
array("en", array("en-UK", "en-US"), null),
@@ -76,7 +83,7 @@
/**
* @dataProvider getHttpLangs
* @param string $content
- * @param array $http
+ * @param array $http
* @param string $result
*/
public function testHttpAcceptDetector($content, $http, $result) {
--
To view, visit https://gerrit.wikimedia.org/r/271719
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I8d1c54d1b2e5f4ee6aebbff83664e3d07f12f0f0
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/CirrusSearch
Gerrit-Branch: master
Gerrit-Owner: Smalyshev <[email protected]>
Gerrit-Reviewer: Cindy-the-browser-test-bot <[email protected]>
Gerrit-Reviewer: DCausse <[email protected]>
Gerrit-Reviewer: EBernhardson <[email protected]>
Gerrit-Reviewer: Manybubbles <[email protected]>
Gerrit-Reviewer: Tjones <[email protected]>
Gerrit-Reviewer: jenkins-bot <>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits