jenkins-bot has submitted this change and it was merged. (
https://gerrit.wikimedia.org/r/372197 )
Change subject: Add Token Filter to Remove Commas from Chinese Indexes
......................................................................
Add Token Filter to Remove Commas from Chinese Indexes
Add a "smartcn_stop" token filter that removes punctuation (which SmartCN
indexes as commas) from the text and plain indexes.
Update the expected test fixture (zh.expected) accordingly.
Bug: T172653
Change-Id: I3761262ad7d22b4344c5630fd937b0eba7bf0a71
---
M includes/Maintenance/AnalysisConfigBuilder.php
M tests/unit/fixtures/languageAnalysis/zh.expected
2 files changed, 18 insertions(+), 1 deletion(-)
Approvals:
EBernhardson: Looks good to me, approved
jenkins-bot: Verified
diff --git a/includes/Maintenance/AnalysisConfigBuilder.php b/includes/Maintenance/AnalysisConfigBuilder.php
index 36d7ffa..510dfed 100644
--- a/includes/Maintenance/AnalysisConfigBuilder.php
+++ b/includes/Maintenance/AnalysisConfigBuilder.php
@@ -605,14 +605,21 @@
'keep_both' => false,
'convert_type' => 't2s',
];
+ $config[ 'filter' ][ 'smartcn_stop' ] = [
+ // SmartCN converts lots of punctuation to "," but we don't want to index it
+ 'type' => 'stop',
+ 'stopwords' => [ "," ],
+ ];
$config[ 'analyzer' ][ 'text' ] = [
'type' => 'custom',
'tokenizer' => 'smartcn_tokenizer',
'char_filter' => [ 'stconvertfix', 'tsconvert'
],
- 'filter' => [ 'lowercase' ],
+ 'filter' => [ 'smartcn_stop', 'lowercase' ],
];
$config[ 'analyzer' ][ 'text_search' ] = $config[
'analyzer' ][ 'text' ];
+ $config[ 'analyzer' ][ 'plain' ][ 'filter' ] = [
'smartcn_stop', 'lowercase' ];
+ $config[ 'analyzer' ][ 'plain_search' ][ 'filter' ] =
$config[ 'analyzer' ][ 'plain' ][ 'filter' ];
break;
case 'english':
$config[ 'filter' ][ 'possessive_english' ] = [
diff --git a/tests/unit/fixtures/languageAnalysis/zh.expected b/tests/unit/fixtures/languageAnalysis/zh.expected
index cd4aeb1..a6008c7 100644
--- a/tests/unit/fixtures/languageAnalysis/zh.expected
+++ b/tests/unit/fixtures/languageAnalysis/zh.expected
@@ -8,6 +8,7 @@
"tsconvert"
],
"filter": [
+ "smartcn_stop",
"lowercase"
]
},
@@ -19,6 +20,7 @@
"tsconvert"
],
"filter": [
+ "smartcn_stop",
"lowercase"
]
},
@@ -26,6 +28,7 @@
"type": "custom",
"tokenizer": "standard",
"filter": [
+ "smartcn_stop",
"lowercase"
],
"char_filter": [
@@ -36,6 +39,7 @@
"type": "custom",
"tokenizer": "standard",
"filter": [
+ "smartcn_stop",
"lowercase"
],
"char_filter": [
@@ -209,6 +213,12 @@
"type": "truncate",
"length": 5000
},
+ "smartcn_stop": {
+ "type": "stop",
+ "stopwords": [
+ ","
+ ]
+ },
"dedup_asciifolding": {
"type": "unique",
"only_on_same_position": true
--
To view, visit https://gerrit.wikimedia.org/r/372197
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I3761262ad7d22b4344c5630fd937b0eba7bf0a71
Gerrit-PatchSet: 3
Gerrit-Project: mediawiki/extensions/CirrusSearch
Gerrit-Branch: master
Gerrit-Owner: Tjones <[email protected]>
Gerrit-Reviewer: Cindy-the-browser-test-bot <[email protected]>
Gerrit-Reviewer: DCausse <[email protected]>
Gerrit-Reviewer: EBernhardson <[email protected]>
Gerrit-Reviewer: Gehel <[email protected]>
Gerrit-Reviewer: Smalyshev <[email protected]>
Gerrit-Reviewer: Tjones <[email protected]>
Gerrit-Reviewer: jenkins-bot <>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits