Tjones has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/372197 )
Change subject: Add Token Filter to Remove Commas from Chinese Indexes
......................................................................
Add Token Filter to Remove Commas from Chinese Indexes
add "smartcn_stop" filter to remove punctuation (all indexed as commas)
from indexes
update tests
Bug: T172653
Change-Id: I3761262ad7d22b4344c5630fd937b0eba7bf0a71
---
M includes/Maintenance/AnalysisConfigBuilder.php
M tests/unit/fixtures/languageAnalysis/zh.expected
2 files changed, 14 insertions(+), 1 deletion(-)
git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch refs/changes/97/372197/1
diff --git a/includes/Maintenance/AnalysisConfigBuilder.php b/includes/Maintenance/AnalysisConfigBuilder.php
index 36d7ffa..3bc41db 100644
--- a/includes/Maintenance/AnalysisConfigBuilder.php
+++ b/includes/Maintenance/AnalysisConfigBuilder.php
@@ -605,11 +605,16 @@
'keep_both' => false,
'convert_type' => 't2s',
];
+ $config[ 'filter' ][ 'smartcn_stop' ] = [
+ // SmartCN converts lots of punctuation to "," but we don't want to index it
+ 'type' => 'stop',
+ 'stopwords' => [","],
+ ];
$config[ 'analyzer' ][ 'text' ] = [
'type' => 'custom',
'tokenizer' => 'smartcn_tokenizer',
'char_filter' => [ 'stconvertfix', 'tsconvert' ],
- 'filter' => [ 'lowercase' ],
+ 'filter' => [ 'smartcn_stop', 'lowercase' ],
];
$config[ 'analyzer' ][ 'text_search' ] = $config[ 'analyzer' ][ 'text' ];
diff --git a/tests/unit/fixtures/languageAnalysis/zh.expected b/tests/unit/fixtures/languageAnalysis/zh.expected
index cd4aeb1..b2f22af 100644
--- a/tests/unit/fixtures/languageAnalysis/zh.expected
+++ b/tests/unit/fixtures/languageAnalysis/zh.expected
@@ -8,6 +8,7 @@
"tsconvert"
],
"filter": [
+ "smartcn_stop",
"lowercase"
]
},
@@ -19,6 +20,7 @@
"tsconvert"
],
"filter": [
+ "smartcn_stop",
"lowercase"
]
},
@@ -209,6 +211,12 @@
"type": "truncate",
"length": 5000
},
+ "smartcn_stop": {
+ "type": "stop",
+ "stopwords": [
+ ","
+ ]
+ },
"dedup_asciifolding": {
"type": "unique",
"only_on_same_position": true
--
To view, visit https://gerrit.wikimedia.org/r/372197
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I3761262ad7d22b4344c5630fd937b0eba7bf0a71
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/CirrusSearch
Gerrit-Branch: master
Gerrit-Owner: Tjones <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits