DCausse has submitted this change and it was merged.
Change subject: Include information about search results in
CirrusSearchRequestSet
......................................................................
Include information about search results in CirrusSearchRequestSet
Adds CirrusSearchHit to the schema at both the top level and per-request.
At the top level it represents what was returned to the user; at the
per-request level it identifies what was returned from elasticsearch.
The .avsc file is built from the idl. The idl format is much easier
to maintain from a human perspective. The .avsc is built as follows (the
avro-tools jar is a standard artifact built as part of apache/avro):
java -jar avro-tools-1.7.7.jar idl2schemata CirrusSearchRequestSet.idl /tmp/csrq
mv /tmp/csrq/CirrusSearchRequestSet.avsc 123456789.avsc
TODO: Would be best if jenkins could build the .avsc file for us
Bug: T128533
Change-Id: I1f194cc318c18e7c292c5450ea011811de8df204
---
A avro/mediawiki/CirrusSearchRequestSet/121456865906.avsc
A avro/mediawiki/CirrusSearchRequestSet/CirrusSearchRequestSet.idl
2 files changed, 282 insertions(+), 0 deletions(-)
Approvals:
DCausse: Looks good to me, approved
jenkins-bot: Verified
diff --git a/avro/mediawiki/CirrusSearchRequestSet/121456865906.avsc
b/avro/mediawiki/CirrusSearchRequestSet/121456865906.avsc
new file mode 100644
index 0000000..e8e0524
--- /dev/null
+++ b/avro/mediawiki/CirrusSearchRequestSet/121456865906.avsc
@@ -0,0 +1,198 @@
+{
+ "type" : "record",
+ "name" : "CirrusSearchRequestSet",
+ "namespace" : "org.wikimedia.analytics.schemas",
+ "doc" : "A set of requests made by CirrusSearch to the elasticsearch user
for a single php execution context",
+ "fields" : [ {
+ "name" : "id",
+ "type" : "string",
+ "doc" : "UUIDv4 id of this request set",
+ "default" : ""
+ }, {
+ "name" : "ts",
+ "type" : "int",
+ "doc" : "The timestamp, in unix time, that the request was made",
+ "default" : 0
+ }, {
+ "name" : "wikiId",
+ "type" : "string",
+ "doc" : "The wiki making this request, such as dewiki or enwiktionary",
+ "default" : ""
+ }, {
+ "name" : "source",
+ "type" : "string",
+ "doc" : "Where the request is coming from. Typically: web, api or cli",
+ "default" : ""
+ }, {
+ "name" : "identity",
+ "type" : "string",
+ "doc" : "A hash identifying the requestor. Includes the IP address and
User Agent when available",
+ "default" : ""
+ }, {
+ "name" : "ip",
+ "type" : "string",
+ "doc" : "The IP address (either ipv4 or ipv6) in string notation",
+ "default" : ""
+ }, {
+ "name" : "userAgent",
+ "type" : "string",
+ "doc" : "The HTTP User-Agent header, or null if not-applicable",
+ "default" : ""
+ }, {
+ "name" : "backendUserTests",
+ "type" : {
+ "type" : "array",
+ "items" : "string"
+ },
+ "doc" : "List of backend tests the requests are participating in",
+ "default" : [ ]
+ }, {
+ "name" : "tookMs",
+ "type" : "float",
+ "doc" : "Total time of the php request in milliseconds",
+ "default" : -1
+ }, {
+ "name" : "payload",
+ "type" : {
+ "type" : "map",
+ "values" : "string"
+ },
+ "doc" : "General purpose data for this request set",
+ "default" : { }
+ }, {
+ "name" : "hits",
+ "type" : {
+ "type" : "array",
+ "items" : {
+ "type" : "record",
+ "name" : "CirrusSearchHit",
+ "doc" : "An individual search result",
+ "fields" : [ {
+ "name" : "title",
+ "type" : "string",
+ "doc" : "MediaWiki page title of the result",
+ "default" : ""
+ }, {
+ "name" : "index",
+ "type" : "string",
+ "doc" : "ElasticSearch index this result came from",
+ "default" : ""
+ }, {
+ "name" : "pageId",
+ "type" : "int",
+ "default" : -1
+ }, {
+ "name" : "score",
+ "type" : "float",
+ "doc" : "Score from ElasticSearch for this result",
+ "default" : -1
+ }, {
+ "name" : "profileName",
+ "type" : "string",
+ "doc" : "The profile name for comp_suggest queries",
+ "default" : ""
+ } ]
+ }
+ },
+ "doc" : "Final set of result pages returned for the php request.",
+ "default" : [ ]
+ }, {
+ "name" : "requests",
+ "type" : {
+ "type" : "array",
+ "items" : {
+ "type" : "record",
+ "name" : "CirrusSearchRequest",
+ "doc" : "An individual request made between MediaWiki and
ElasticSearch",
+ "fields" : [ {
+ "name" : "query",
+ "type" : "string",
+ "doc" : "The actual search request",
+ "default" : ""
+ }, {
+ "name" : "queryType",
+ "type" : "string",
+ "default" : ""
+ }, {
+ "name" : "indices",
+ "type" : {
+ "type" : "array",
+ "items" : "string"
+ },
+ "default" : [ ]
+ }, {
+ "name" : "tookMs",
+ "type" : "int",
+ "doc" : "The number of milliseconds between passing the query to the
client library and getting the response back in the application",
+ "default" : -1
+ }, {
+ "name" : "elasticTookMs",
+ "type" : "int",
+ "doc" : "The number of milliseconds the query took, according to the
elasticsearch response",
+ "default" : -1
+ }, {
+ "name" : "limit",
+ "type" : "int",
+ "doc" : "The maximum number of results requested by the application",
+ "default" : -1
+ }, {
+ "name" : "hitsTotal",
+ "type" : "int",
+ "doc" : "The approximate total number of documents matching the
query",
+ "default" : -1
+ }, {
+ "name" : "hitsReturned",
+ "type" : "int",
+ "doc" : "The number of results returned to the application",
+ "default" : -1
+ }, {
+ "name" : "hitsOffset",
+ "type" : "int",
+ "doc" : "The offset of the query",
+ "default" : -1
+ }, {
+ "name" : "namespaces",
+ "type" : {
+ "type" : "array",
+ "items" : "int"
+ },
+ "doc" : "Each element is a mediawiki namespace id that was searched",
+ "default" : [ ]
+ }, {
+ "name" : "suggestion",
+ "type" : "string",
+ "doc" : "The suggestion generated by elasticsearch",
+ "default" : ""
+ }, {
+ "name" : "suggestionRequested",
+ "type" : "boolean",
+ "doc" : "If a suggestion was requested from elasticsearch",
+ "default" : false
+ }, {
+ "name" : "maxScore",
+ "type" : "float",
+ "doc" : "Max score returned by elasticsearch, this is the best score
in the\n * results before we apply rescore queries. Unfortunaltely we do
not know if\n * it's part or the result but it can give a rough idea of the
score range\n * before we apply the rescore queries.",
+ "default" : -1.0
+ }, {
+ "name" : "payload",
+ "type" : {
+ "type" : "map",
+ "values" : "string"
+ },
+ "doc" : "General purpose data for this request",
+ "default" : { }
+ }, {
+ "name" : "hits",
+ "type" : {
+ "type" : "array",
+ "items" : "CirrusSearchHit"
+ },
+ "doc" : "Final set of result pages returned for the ElasticSearch
request",
+ "default" : [ ]
+ } ]
+ }
+ },
+ "doc" : "A list of requests made between mediawiki and elasticsearch in a
single execution context",
+ "default" : [ ]
+ } ]
+}
diff --git a/avro/mediawiki/CirrusSearchRequestSet/CirrusSearchRequestSet.idl
b/avro/mediawiki/CirrusSearchRequestSet/CirrusSearchRequestSet.idl
new file mode 100644
index 0000000..28fcdd0
--- /dev/null
+++ b/avro/mediawiki/CirrusSearchRequestSet/CirrusSearchRequestSet.idl
@@ -0,0 +1,84 @@
+@namespace("org.wikimedia.analytics.schemas")
+protocol CirrusSearchLogging {
+
+ /** An individual search result */
+ record CirrusSearchHit {
+ /** MediaWiki page title of the result */
+ string title = "";
+ /** ElasticSearch index this result came from */
+ string index = "";
+ /* MediaWiki page id. May be -1 for interwiki results */
+ int pageId = -1;
+ /** Score from ElasticSearch for this result */
+ float score = -1;
+ /** The profile name for comp_suggest queries */
+ string profileName = "";
+ }
+
+ /** An individual request made between MediaWiki and ElasticSearch */
+ record CirrusSearchRequest {
+ /** The actual search request */
+ string query = "";
+ /* The general type of query performed, such as full_text, prefix, etc. */
+ string queryType = "";
+ /* The list of indices the request was performed against */
+ array<string> indices = [];
+ /** The number of milliseconds between passing the query to the client
library and getting the response back in the application */
+ int tookMs = -1;
+ /** The number of milliseconds the query took, according to the
elasticsearch response */
+ int elasticTookMs = -1;
+ /** The maximum number of results requested by the application */
+ int limit = -1;
+ /** The approximate total number of documents matching the query */
+ int hitsTotal = -1;
+ /** The number of results returned to the application */
+ int hitsReturned = -1;
+ /** The offset of the query */
+ int hitsOffset = -1;
+ /** Each element is a mediawiki namespace id that was searched */
+ array<int> namespaces = [];
+ /** The suggestion generated by elasticsearch */
+ string suggestion = "";
+ /** If a suggestion was requested from elasticsearch */
+ boolean suggestionRequested = false;
+ /** Max score returned by elasticsearch, this is the best score in the
+ * results before we apply rescore queries. Unfortunaltely we do not know
if
+ * it's part or the result but it can give a rough idea of the score range
+ * before we apply the rescore queries.
+ */
+ float maxScore = -1.0;
+ /** General purpose data for this request */
+ map<string> payload = {};
+ /** Final set of result pages returned for the ElasticSearch request */
+ array<CirrusSearchHit> hits = [];
+ }
+
+ /** A set of requests made by CirrusSearch to the elasticsearch user for a
single php execution context */
+ record CirrusSearchRequestSet {
+ /** UUIDv4 id of this request set */
+ string id = "";
+ /** The timestamp, in unix time, that the request was made */
+ int ts = 0;
+ /** The wiki making this request, such as dewiki or enwiktionary */
+ string wikiId = "";
+ /** Where the request is coming from. Typically: web, api or cli */
+ string source = "";
+ /** A hash identifying the requestor. Includes the IP address and User
Agent when available */
+ string identity = "";
+ /** The IP address (either ipv4 or ipv6) in string notation */
+ string ip = "";
+ /** The HTTP User-Agent header, or null if not-applicable */
+ string userAgent = "";
+ /** List of backend tests the requests are participating in */
+ array<string> backendUserTests = [];
+ /** Total time of the php request in milliseconds */
+ float tookMs = -1;
+ /** General purpose data for this request set */
+ map<string> payload = {};
+ /** Final set of result pages returned for the php request. */
+ array<CirrusSearchHit> hits = [];
+ /** A list of requests made between mediawiki and elasticsearch in a
single execution context */
+ array<CirrusSearchRequest> requests = [];
+ }
+
+}
--
To view, visit https://gerrit.wikimedia.org/r/274312
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I1f194cc318c18e7c292c5450ea011811de8df204
Gerrit-PatchSet: 4
Gerrit-Project: mediawiki/event-schemas
Gerrit-Branch: master
Gerrit-Owner: EBernhardson <[email protected]>
Gerrit-Reviewer: DCausse <[email protected]>
Gerrit-Reviewer: EBernhardson <[email protected]>
Gerrit-Reviewer: Gehel <[email protected]>
Gerrit-Reviewer: jenkins-bot <>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits