DCausse has uploaded a new change for review. https://gerrit.wikimedia.org/r/280245
Change subject: CirrusSearch: Add new rescore profiles ...................................................................... CirrusSearch: Add new rescore profiles In order to run optimization plans we need these rescore profiles enabled in production. This should be a temporary solution, we will run various optimizations with large samples to evaluate if any of these new functions are worth the effort. These profiles are also suited for A/B tests: all the values can be overridden by a request param. I'm not sure that wmf-config is the best place to store these profiles. They are particularly verbose and will certainly require fine-tuning per wiki. I wonder if a datastore (RESTBase?) might be more appropriate, this would allow us to do live optimizations without deploying anything... NOTE: depends on 1.27.0-wmf.19 Bug: T127896 Change-Id: I17aab1177662a5dad2c513ad1b406f3fd155ecc4 --- M wmf-config/CirrusSearch-common.php 1 file changed, 243 insertions(+), 0 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/operations/mediawiki-config refs/changes/45/280245/1 diff --git a/wmf-config/CirrusSearch-common.php b/wmf-config/CirrusSearch-common.php index b7ff669..1c51f5e 100644 --- a/wmf-config/CirrusSearch-common.php +++ b/wmf-config/CirrusSearch-common.php @@ -1,4 +1,6 @@ <?php +/* vim: set sw=4 ts=4 noet foldmarker=@{,@} foldmethod=marker: */ + # WARNING: This file is publically viewable on the web. Do not put private data here. # This file hold the CirrusSearch configuration which is common to all realms, @@ -166,6 +168,247 @@ // Configure ICU Folding $wgCirrusSearchUseIcuFolding = $wmgCirrusSearchUseIcuFolding; +// List of extra rescore profiles @{cirrus extra rescore profiles +// These profiles are needed to run optimization plans with large +// sample of queries in production. 
+$wgCirrusSearchRescoreProfiles += array( + 'geomean_log' => array( + 'supported_namespaces' => 'content', + 'fallback_profile' => 'default', + 'rescore' => array( + array( + 'window' => 8192, + 'window_size_override' => 'CirrusSearchFunctionRescoreWindowSize', + 'type' => 'function_score', + 'function_chain' => 'geomean_log', + 'query_weight' => 1.0, + 'rescore_query_weight' => 1.0, + 'score_mode' => 'multiply', + ), + array( + 'window' => 8192, + 'window_size_override' => 'CirrusSearchFunctionRescoreWindowSize', + 'type' => 'function_score', + 'function_chain' => 'optional_chain', + 'score_mode' => 'multiply', + ), + ), + ), + 'geomean_satu' => array( + 'supported_namespaces' => 'content', + 'fallback_profile' => 'default', + 'rescore' => array( + array( + 'window' => 8192, + 'window_size_override' => 'CirrusSearchFunctionRescoreWindowSize', + 'type' => 'function_score', + 'function_chain' => 'geomean_satu', + 'query_weight' => 1.0, + 'rescore_query_weight' => 1.0, + 'score_mode' => 'multiply', + ), + array( + 'window' => 8192, + 'window_size_override' => 'CirrusSearchFunctionRescoreWindowSize', + 'type' => 'function_score', + 'function_chain' => 'optional_chain', + 'score_mode' => 'multiply', + ), + ), + ), +); + + +$wgCirrusSearchRescoreFunctionScoreChains += array( + // GeoMean with logscale_boost + 'geomean_log' => array( + 'functions' => array( + array( + 'type' => 'geomean', + 'params' => array( + 'impact' => array( + 'uri_param_override' => 'cirrusGeoMeanLogImpact', + 'config_override' => 'CirrusSearchGeoMeanLogImpact', + 'value' => 1, + ), + 'members' => array( + array( + 'weight' => array( + 'uri_param_override' => 'cirrusBoostLinksWeight', + 'config_override' => 'CirrusSearchBoostLinksWeight', + 'value' => 1, + ), + 'type' => 'logscale_boost', + 'params' => array( + 'field' => 'incoming_links', + 'scale' => array( + 'value' => 500000, + 'uri_param_override' => 'cirrusBoostLinksScale', + 'config_override' => 'CirrusSearchBoostLinksScale', + ), + 
'midpoint' => array( + 'value' => 1000, + 'uri_param_override' => 'cirrusBoostLinksCenter', + 'config_override' => 'CirrusSearchBoostLinksCenter', + ), + ), + ), + array( + 'weight' => array( + 'uri_param_override' => 'cirrusPopScoreWeight', + 'config_override' => 'CirrusSearchPopScoreWeight', + 'value' => 0, + ), + 'type' => 'logscale_boost', + 'params' => array( + 'field' => 'popularity_score', + 'scale' => array( + 'value' => 0.0001, + 'uri_param_override' => 'cirrusPopScoreScale', + 'config_override' => 'CirrusSearchPopScoreScale', + ), + 'midpoint' => array( + 'value' => 0.000007, + 'uri_param_override' => 'cirrusPopScoreCenter', + 'config_override' => 'CirrusSearchPopScoreCenter', + ), + ), + ), + array( + 'weight' => array( + 'uri_param_override' => 'cirrusBoostSizeWeight', + 'config_override' => 'CirrusSearchBoostSizeWeight', + 'value' => 0, + ), + 'type' => 'logscale_boost', + 'params' => array( + 'field' => 'text.word_count', + 'scale' => array( + 'value' => 30000, + 'uri_param_override' => 'cirrusBoostSizeScale', + 'config_override' => 'CirrusSearchBoostSizeScale', + ), + 'midpoint' => array( + 'value' => 350, + 'uri_param_override' => 'cirrusBoostSizeCenter', + 'config_override' => 'CirrusSearchBoostSizeCenter', + ), + ), + ), + ), + ), + ), + ), + ), + // GeoMean with saturation function + 'geomean_satu' => array( + 'functions' => array( + array( + 'type' => 'geomean', + 'params' => array( + 'impact' => array( + 'uri_param_override' => 'cirrusGeoMeanSatuImpact', + 'config_override' => 'CirrusSearchGeoMeanSatuImpact', + 'value' => 0.5, + ), + 'members' => array( + array( + 'weight' => array( + 'uri_param_override' => 'cirrusBoostLinksWeight', + 'config_override' => 'CirrusSearchBoostLinksWeight', + 'value' => 1, + ), + 'type' => 'satu', + 'params' => array( + 'field' => 'incoming_links', + 'k' => array( + 'value' => 300, + 'uri_param_override' => 'cirrusBoostLinksK', + 'config_override' => 'CirrusSearchBoostLinksK', + ), + 'a' => array( + 'value' => 
250, + 'uri_param_override' => 'cirrusBoostLinksA', + 'config_override' => 'CirrusSearchBoostLinksA', + ), + ), + ), + array( + 'weight' => array( + 'uri_param_override' => 'cirrusPopScoreWeight', + 'config_override' => 'CirrusSearchPopScoreWeight', + 'value' => 0, + ), + 'type' => 'satu', + 'params' => array( + 'field' => 'popularity_score', + 'k' => array( + 'value' => 0.000007, + 'uri_param_override' => 'cirrusPopScoreK', + 'config_override' => 'CirrusSearchPopScoreK', + ), + 'a' => array( + 'value' => 250, + 'uri_param_override' => 'cirrusPopScoreA', + 'config_override' => 'CirrusSearchPopScoreA', + ), + ), + ), + array( + 'weight' => array( + 'uri_param_override' => 'cirrusBoostSizeWeight', + 'config_override' => 'CirrusSearchBoostSizeWeight', + 'value' => 0, + ), + 'type' => 'satu', + 'params' => array( + 'field' => 'text.word_count', + 'k' => array( + 'value' => 300, + 'uri_param_override' => 'cirrusBoostSizeK', + 'config_override' => 'CirrusSearchBoostSizeK', + ), + 'a' => array( + 'value' => 1, + 'uri_param_override' => 'cirrusBoostSizeA', + 'config_override' => 'CirrusSearchBoostSizeA', + ), + ), + ), + ), + ), + ), + ), + ), +); + +# TODO: move to InitialiseSettings.php if this technique is proven useful and +# once we have an optimized value for all wikis. 
(Default values are an +# approximation for enwiki) +# (All values need to be overridden here for runSearch to work) +$wgCirrusSearchGeoMeanLogImpact = 1; +$wgCirrusSearchGeoMeanSatuImpact = 0.5; +$wgCirrusSearchBoostLinksWeight = 1; +$wgCirrusSearchPopScoreWeight = 0; +$wgCirrusSearchBoostSizeWeight = 0; + +$wgCirrusSearchBoostLinksScale = 500000; +$wgCirrusSearchBoostLinksCenter = 1000; +$wgCirrusSearchPopScoreScale = 0.0001; +$wgCirrusSearchPopScoreCenter = 0.000003; +$wgCirrusSearchBoostSizeScale = 30000; +$wgCirrusSearchBoostSizeCenter = 350; + +$wgCirrusSearchBoostLinksK = 1000; +$wgCirrusSearchPopScoreK = 0.000007; +$wgCirrusSearchBoostSizeK = 350; +$wgCirrusSearchBoostLinksA = 1; +$wgCirrusSearchPopScoreA = 1; +$wgCirrusSearchBoostSizeA = 1; + +// @} end of cirrus extra rescore profiles + + # Load per realm specific configuration, either: # - CirrusSearch-labs.php # - CirrusSearch-production.php -- To view, visit https://gerrit.wikimedia.org/r/280245 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I17aab1177662a5dad2c513ad1b406f3fd155ecc4 Gerrit-PatchSet: 1 Gerrit-Project: operations/mediawiki-config Gerrit-Branch: master Gerrit-Owner: DCausse <dcau...@wikimedia.org> Gerrit-Reviewer: DCausse <dcau...@wikimedia.org> Gerrit-Reviewer: EBernhardson <ebernhard...@wikimedia.org> Gerrit-Reviewer: jenkins-bot <> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits