DCausse has submitted this change and it was merged.

Change subject: Include information about search results in 
CirrusSearchRequestSet
......................................................................


Include information about search results in CirrusSearchRequestSet

Adds CirrusSearchHit to the schema at both top level and per-request.
At the top level this represents what was returned to the user; at the
per-request level it identifies what was returned from elasticsearch.

The .avsc file is built from the idl. The idl format is much easier
to maintain from a human perspective. The .avsc is built as follows; the
avro-tools jar is a standard artifact built as part of apache/avro.

  java -jar avro-tools-1.7.7.jar idl2schemata CirrusSearchRequestSet.idl 
/tmp/csrq
  mv /tmp/csrq/CirrusSearchRequestSet.avsc 123456789.avsc

TODO: Would be best if jenkins could build the .avsc file for us

Bug: T128533
Change-Id: I1f194cc318c18e7c292c5450ea011811de8df204
---
A avro/mediawiki/CirrusSearchRequestSet/121456865906.avsc
A avro/mediawiki/CirrusSearchRequestSet/CirrusSearchRequestSet.idl
2 files changed, 282 insertions(+), 0 deletions(-)

Approvals:
  DCausse: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/avro/mediawiki/CirrusSearchRequestSet/121456865906.avsc 
b/avro/mediawiki/CirrusSearchRequestSet/121456865906.avsc
new file mode 100644
index 0000000..e8e0524
--- /dev/null
+++ b/avro/mediawiki/CirrusSearchRequestSet/121456865906.avsc
@@ -0,0 +1,198 @@
+{
+  "type" : "record",
+  "name" : "CirrusSearchRequestSet",
+  "namespace" : "org.wikimedia.analytics.schemas",
+  "doc" : "A set of requests made by CirrusSearch to the elasticsearch user 
for a single php execution context",
+  "fields" : [ {
+    "name" : "id",
+    "type" : "string",
+    "doc" : "UUIDv4 id of this request set",
+    "default" : ""
+  }, {
+    "name" : "ts",
+    "type" : "int",
+    "doc" : "The timestamp, in unix time, that the request was made",
+    "default" : 0
+  }, {
+    "name" : "wikiId",
+    "type" : "string",
+    "doc" : "The wiki making this request, such as dewiki or enwiktionary",
+    "default" : ""
+  }, {
+    "name" : "source",
+    "type" : "string",
+    "doc" : "Where the request is coming from. Typically: web, api or cli",
+    "default" : ""
+  }, {
+    "name" : "identity",
+    "type" : "string",
+    "doc" : "A hash identifying the requestor. Includes the IP address and 
User Agent when available",
+    "default" : ""
+  }, {
+    "name" : "ip",
+    "type" : "string",
+    "doc" : "The IP address (either ipv4 or ipv6) in string notation",
+    "default" : ""
+  }, {
+    "name" : "userAgent",
+    "type" : "string",
+    "doc" : "The HTTP User-Agent header, or null if not-applicable",
+    "default" : ""
+  }, {
+    "name" : "backendUserTests",
+    "type" : {
+      "type" : "array",
+      "items" : "string"
+    },
+    "doc" : "List of backend tests the requests are participating in",
+    "default" : [ ]
+  }, {
+    "name" : "tookMs",
+    "type" : "float",
+    "doc" : "Total time of the php request in milliseconds",
+    "default" : -1
+  }, {
+    "name" : "payload",
+    "type" : {
+      "type" : "map",
+      "values" : "string"
+    },
+    "doc" : "General purpose data for this request set",
+    "default" : { }
+  }, {
+    "name" : "hits",
+    "type" : {
+      "type" : "array",
+      "items" : {
+        "type" : "record",
+        "name" : "CirrusSearchHit",
+        "doc" : "An individual search result",
+        "fields" : [ {
+          "name" : "title",
+          "type" : "string",
+          "doc" : "MediaWiki page title of the result",
+          "default" : ""
+        }, {
+          "name" : "index",
+          "type" : "string",
+          "doc" : "ElasticSearch index this result came from",
+          "default" : ""
+        }, {
+          "name" : "pageId",
+          "type" : "int",
+          "default" : -1
+        }, {
+          "name" : "score",
+          "type" : "float",
+          "doc" : "Score from ElasticSearch for this result",
+          "default" : -1
+        }, {
+          "name" : "profileName",
+          "type" : "string",
+          "doc" : "The profile name for comp_suggest queries",
+          "default" : ""
+        } ]
+      }
+    },
+    "doc" : "Final set of result pages returned for the php request.",
+    "default" : [ ]
+  }, {
+    "name" : "requests",
+    "type" : {
+      "type" : "array",
+      "items" : {
+        "type" : "record",
+        "name" : "CirrusSearchRequest",
+        "doc" : "An individual request made between MediaWiki and 
ElasticSearch",
+        "fields" : [ {
+          "name" : "query",
+          "type" : "string",
+          "doc" : "The actual search request",
+          "default" : ""
+        }, {
+          "name" : "queryType",
+          "type" : "string",
+          "default" : ""
+        }, {
+          "name" : "indices",
+          "type" : {
+            "type" : "array",
+            "items" : "string"
+          },
+          "default" : [ ]
+        }, {
+          "name" : "tookMs",
+          "type" : "int",
+          "doc" : "The number of milliseconds between passing the query to the 
client library and getting the response back in the application",
+          "default" : -1
+        }, {
+          "name" : "elasticTookMs",
+          "type" : "int",
+          "doc" : "The number of milliseconds the query took, according to the 
elasticsearch response",
+          "default" : -1
+        }, {
+          "name" : "limit",
+          "type" : "int",
+          "doc" : "The maximum number of results requested by the application",
+          "default" : -1
+        }, {
+          "name" : "hitsTotal",
+          "type" : "int",
+          "doc" : "The approximate total number of documents matching the 
query",
+          "default" : -1
+        }, {
+          "name" : "hitsReturned",
+          "type" : "int",
+          "doc" : "The number of results returned to the application",
+          "default" : -1
+        }, {
+          "name" : "hitsOffset",
+          "type" : "int",
+          "doc" : "The offset of the query",
+          "default" : -1
+        }, {
+          "name" : "namespaces",
+          "type" : {
+            "type" : "array",
+            "items" : "int"
+          },
+          "doc" : "Each element is a mediawiki namespace id that was searched",
+          "default" : [ ]
+        }, {
+          "name" : "suggestion",
+          "type" : "string",
+          "doc" : "The suggestion generated by elasticsearch",
+          "default" : ""
+        }, {
+          "name" : "suggestionRequested",
+          "type" : "boolean",
+          "doc" : "If a suggestion was requested from elasticsearch",
+          "default" : false
+        }, {
+          "name" : "maxScore",
+          "type" : "float",
+          "doc" : "Max score returned by elasticsearch, this is the best score 
in the\n     * results before we apply rescore queries.  Unfortunaltely we do 
not know if\n     * it's part or the result but it can give a rough idea of the 
score range\n     * before we apply the rescore queries.",
+          "default" : -1.0
+        }, {
+          "name" : "payload",
+          "type" : {
+            "type" : "map",
+            "values" : "string"
+          },
+          "doc" : "General purpose data for this request",
+          "default" : { }
+        }, {
+          "name" : "hits",
+          "type" : {
+            "type" : "array",
+            "items" : "CirrusSearchHit"
+          },
+          "doc" : "Final set of result pages returned for the ElasticSearch 
request",
+          "default" : [ ]
+        } ]
+      }
+    },
+    "doc" : "A list of requests made between mediawiki and elasticsearch in a 
single execution context",
+    "default" : [ ]
+  } ]
+}
diff --git a/avro/mediawiki/CirrusSearchRequestSet/CirrusSearchRequestSet.idl 
b/avro/mediawiki/CirrusSearchRequestSet/CirrusSearchRequestSet.idl
new file mode 100644
index 0000000..28fcdd0
--- /dev/null
+++ b/avro/mediawiki/CirrusSearchRequestSet/CirrusSearchRequestSet.idl
@@ -0,0 +1,84 @@
+@namespace("org.wikimedia.analytics.schemas")
+protocol CirrusSearchLogging {
+
+  /** An individual search result */
+  record CirrusSearchHit {
+    /** MediaWiki page title of the result */
+    string title = "";
+    /** ElasticSearch index this result came from */
+    string index = "";
+    /* MediaWiki page id. May be -1 for interwiki results */
+    int pageId = -1;
+    /** Score from ElasticSearch for this result */
+    float score = -1;
+       /** The profile name for comp_suggest queries */
+       string profileName = "";
+  }
+
+  /** An individual request made between MediaWiki and ElasticSearch */
+  record CirrusSearchRequest {
+    /** The actual search request */
+    string query = "";
+    /* The general type of query performed, such as full_text, prefix, etc. */
+    string queryType = "";
+    /* The list of indices the request was performed against */
+    array<string> indices = [];
+    /** The number of milliseconds between passing the query to the client 
library and getting the response back in the application */
+    int tookMs = -1;
+    /** The number of milliseconds the query took, according to the 
elasticsearch response */
+    int elasticTookMs = -1;
+    /** The maximum number of results requested by the application */
+    int limit = -1;
+    /** The approximate total number of documents matching the query */
+    int hitsTotal = -1;
+    /** The number of results returned to the application */
+    int hitsReturned = -1;
+    /** The offset of the query */
+    int hitsOffset = -1;
+    /** Each element is a mediawiki namespace id that was searched */
+    array<int> namespaces = [];
+    /** The suggestion generated by elasticsearch */
+    string suggestion = "";
+    /** If a suggestion was requested from elasticsearch */
+    boolean suggestionRequested = false;
+       /** Max score returned by elasticsearch, this is the best score in the
+     * results before we apply rescore queries.  Unfortunaltely we do not know 
if
+     * it's part or the result but it can give a rough idea of the score range
+     * before we apply the rescore queries.
+        */
+       float maxScore = -1.0;
+    /** General purpose data for this request */
+    map<string> payload = {};
+    /** Final set of result pages returned for the ElasticSearch request */
+    array<CirrusSearchHit> hits = [];
+  }
+
+  /** A set of requests made by CirrusSearch to the elasticsearch user for a 
single php execution context */
+  record CirrusSearchRequestSet {
+    /** UUIDv4 id of this request set */
+    string id = "";
+    /** The timestamp, in unix time, that the request was made */
+    int ts = 0;
+    /** The wiki making this request, such as dewiki or enwiktionary */
+    string wikiId = "";
+    /** Where the request is coming from. Typically: web, api or cli */
+    string source = "";
+    /** A hash identifying the requestor. Includes the IP address and User 
Agent when available */
+    string identity = "";
+    /** The IP address (either ipv4 or ipv6) in string notation */
+    string ip = "";
+    /** The HTTP User-Agent header, or null if not-applicable */
+    string userAgent = "";
+    /** List of backend tests the requests are participating in */
+    array<string> backendUserTests = [];
+    /** Total time of the php request in milliseconds */
+    float tookMs = -1;
+    /** General purpose data for this request set */
+    map<string> payload = {};
+    /** Final set of result pages returned for the php request. */
+    array<CirrusSearchHit> hits = [];
+    /** A list of requests made between mediawiki and elasticsearch in a 
single execution context */
+    array<CirrusSearchRequest> requests = [];
+  }
+
+}

-- 
To view, visit https://gerrit.wikimedia.org/r/274312
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I1f194cc318c18e7c292c5450ea011811de8df204
Gerrit-PatchSet: 4
Gerrit-Project: mediawiki/event-schemas
Gerrit-Branch: master
Gerrit-Owner: EBernhardson <[email protected]>
Gerrit-Reviewer: DCausse <[email protected]>
Gerrit-Reviewer: EBernhardson <[email protected]>
Gerrit-Reviewer: Gehel <[email protected]>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to