This is an automated email from the ASF dual-hosted git repository. ejones pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/flagon-distill.git
The following commit(s) were added to refs/heads/master by this push: new c127bc9 38 add support to label logs (#44) c127bc9 is described below commit c127bc9cb09d32bf67674349089dc797239cae65 Author: Broden222 <154264811+broden...@users.noreply.github.com> AuthorDate: Mon Jul 15 12:01:01 2024 -0400 38 add support to label logs (#44) * draft 38 add support to label logs * 38 add support label logs changes * 38 add support label logs changes * feature_definition, tranform, and test_transform updates * Should I create the process.py in the distill/process directory. Also, should the label function remain in the transform.py as well? * updated transform.py test_transform.py, and labels.py * test_transform.py * updated test_transform.py --- distill/core/feature_definition.py | 32 ++++ distill/process/transform.py | 21 +++ examples/Distill_Workflow_Example.ipynb | 278 ++++++++++++++++++-------------- examples/labels.py | 91 +++++++++++ tests/test_transform.py | 38 +++++ 5 files changed, 336 insertions(+), 124 deletions(-) diff --git a/distill/core/feature_definition.py b/distill/core/feature_definition.py new file mode 100644 index 0000000..7708740 --- /dev/null +++ b/distill/core/feature_definition.py @@ -0,0 +1,32 @@ +from typing import Any, Dict, List, Callable + +class FeatureDefinition: + + def __init__(self, label: str, rule: Callable[[Dict[str, Any]], bool]): + """ + Allows users to specify a rule or set of rules and an + associated label and then use this to add labels to + logs in Distill + + param label: a string we want to add to the log if the rule is met + param rule: must be a callable function which accepts a UserALE log + as an input, and returns a boolean of whether that rule was met or not + """ + if not callable(rule): + raise TypeError("Rule not callable") + + if not isinstance(label, str): + raise TypeError("Label is not a string") + + self.label = label + self._rule = rule + + def matches(self, log: Dict[str, Any]) -> bool: + """ + A wrapper method around the private rule attribute we + store on self during init + """ + return self._rule(log) + + + diff --git a/distill/process/transform.py b/distill/process/transform.py index 86d8011..6788173 100644 --- a/distill/process/transform.py +++ b/distill/process/transform.py @@ -15,6 +15,8 @@ # limitations under the License. import itertools +from typing import Any, Dict, List, Callable +from distill.core.feature_definition import FeatureDefinition def pairwiseStag(iterable, *, split: bool = False): @@ -52,3 +54,22 @@ def pairwiseSeq(iterable, *, split: bool = False): return list1, list2 else: return list(pairs) + +def label_features( + logs: List[Dict[str, Any]], definitions: List[FeatureDefinition] +) -> List[Dict[str, Any]]: + """ + Check whether a log matches the specified a rule or set of rules + and an associated label definition + + param logs: UserALE log + definitions: specified rule(s) and label + return: logs + """ + for log in logs: + for definition in definitions: + if definition.matches(log): + if "labels" not in log: + log.update({"labels": list()}) + log["labels"].append(definition.label) + return logs diff --git a/examples/Distill_Workflow_Example.ipynb b/examples/Distill_Workflow_Example.ipynb index aa99017..12a7309 100644 --- a/examples/Distill_Workflow_Example.ipynb +++ b/examples/Distill_Workflow_Example.ipynb @@ -55,7 +55,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "id": "9d2e506b", "metadata": {}, "outputs": [], @@ -100,7 +100,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "id": "e9cbd628", "metadata": {}, "outputs": [], @@ -139,7 +139,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "id": "ca532053", "metadata": {}, "outputs": [], @@ -168,7 +168,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "id": "efb27f53", "metadata": {}, "outputs": [ @@ -199,7 +199,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "id": "35cd3449", "metadata": {}, "outputs": [], @@ -243,7 +243,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "id": "b6297d7f", "metadata": {}, "outputs": [ @@ -252,75 +252,18 @@ "text/html": [ " <script type=\"text/javascript\">\n", " window.PlotlyConfig = {MathJaxConfig: 'local'};\n", - " if (window.MathJax) {MathJax.Hub.Config({SVG: {font: \"STIX-Web\"}});}\n", + " if (window.MathJax && window.MathJax.Hub && window.MathJax.Hub.Config) {window.MathJax.Hub.Config({SVG: {font: \"STIX-Web\"}});}\n", " if (typeof require !== 'undefined') {\n", " require.undef(\"plotly\");\n", " define('plotly', function(require, exports, module) {\n", " /**\n", - "* plotly.js v2.8.3\n", - "* Copyright 2012-2021, Plotly, Inc.\n", + "* plotly.js v2.32.0\n", + "* Copyright 2012-2024, Plotly, Inc.\n", "* All rights reserved.\n", "* Licensed under the MIT license\n", "*/\n", - "!function(t){if(\"object\"==typeof exports&&\"undefined\"!=typeof module)module.exports=t();else if(\"function\"==typeof define&&define.amd)define([],t);else{(\"undefined\"!=typeof window?window:\"undefined\"!=typeof global?global:\"undefined\"!=typeof self?self:this).Plotly=t()}}((function(){return function t(e,r,n){function i(o,s){if(!r[o]){if(!e[o]){var l=\"function\"==typeof require&&require;if(!s&&l)return l(o,!0);if(a)return a(o,!0);var c=new Error(\"Cannot find module '\"+ [...] - "/*!\n", - " * The buffer module from node.js, for the browser.\n", - " *\n", - " * @author Feross Aboukhadijeh <fer...@feross.org> <http://feross.org>\n", - " * @license MIT\n", - " */function i(t,e){if(t===e)return 0;for(var r=t.length,n=e.length,i=0,a=Math.min(r,n);i<a;++i)if(t[i]!==e[i]){r=t[i],n=e[i];break}return r<n?-1:n<r?1:0}function a(t){return r.Buffer&&\"function\"==typeof r.Buffer.isBuffer?r.Buffer.isBuffer(t):!(null==t||!t._isBuffer)}var o=t(\"util/\"),s=Object.prototype.hasOwnProperty,l=Array.prototype.slice,c=\"foo\"===function(){}.name;function u(t){return Object.prototype.toString.call(t)}function f(t){return!a(t)&&(\"function\"==typeof r.Ar [...] - "/*!\n", - " * The buffer module from node.js, for the browser.\n", - " *\n", - " * @author Feross Aboukhadijeh <https://feross.org>\n", - " * @license MIT\n", - " */\n", - "\"use strict\";var e=t(\"base64-js\"),n=t(\"ieee754\");r.Buffer=a,r.SlowBuffer=function(t){+t!=t&&(t=0);return a.alloc(+t)},r.INSPECT_MAX_BYTES=50;function i(t){if(t>2147483647)throw new RangeError('The value \"'+t+'\" is invalid for option \"size\"');var e=new Uint8Array(t);return e.__proto__=a.prototype,e}function a(t,e,r){if(\"number\"==typeof t){if(\"string\"==typeof e)throw new TypeError('The \"string\" argument must be of type string. Received type number');return l(t)}retu [...] - "/*! Native Promise Only\n", - " v0.8.1 (c) Kyle Simpson\n", - " MIT License: http://getify.mit-license.org\n", - "*/\n", - "!function(t,r,n){r[t]=r[t]||n(),void 0!==e&&e.exports&&(e.exports=r[t])}(\"Promise\",void 0!==t?t:this,(function(){\"use strict\";var t,e,n,i=Object.prototype.toString,a=void 0!==r?function(t){return r(t)}:setTimeout;try{Object.defineProperty({},\"x\",{}),t=function(t,e,r,n){return Object.defineProperty(t,e,{value:r,writable:!0,configurable:!1!==n})}}catch(e){t=function(t,e,r){return t[e]=r,t}}function o(t,r){n.add(t,r),e||(e=a(n.drain))}function s(t){var e,r=typeof t;return null [...] - "/*\n", - "object-assign\n", - "(c) Sindre Sorhus\n", - "@license MIT\n", - "*/\n", - "\"use strict\";var n=Object.getOwnPropertySymbols,i=Object.prototype.hasOwnProperty,a=Object.prototype.propertyIsEnumerable;function o(t){if(null==t)throw new TypeError(\"Object.assign cannot be called with null or undefined\");return Object(t)}e.exports=function(){try{if(!Object.assign)return!1;var t=new String(\"abc\");if(t[5]=\"de\",\"5\"===Object.getOwnPropertyNames(t)[0])return!1;for(var e={},r=0;r<10;r++)e[\"_\"+String.fromCharCode(r)]=r;if(\"0123456789\"!==Object.getOwnPro [...] - "/*\n", - " * @copyright 2016 Sean Connelly (@voidqk), http://syntheti.cc\n", - " * @license MIT\n", - " * @preserve Project Home: https://github.com/voidqk/polybooljs\n", - " */\n", - "var n,i=t(\"./lib/build-log\"),a=t(\"./lib/epsilon\"),o=t(\"./lib/intersecter\"),s=t(\"./lib/segment-chainer\"),l=t(\"./lib/segment-selector\"),c=t(\"./lib/geojson\"),u=!1,f=a();function h(t,e,r){var i=n.segments(t),a=n.segments(e),o=r(n.combine(i,a));return n.polygon(o)}n={buildLog:function(t){return!0===t?u=i():!1===t&&(u=!1),!1!==u&&u.list},epsilon:function(t){return f.epsilon(t)},segments:function(t){var e=o(!0,f,u);return t.regions.forEach(e.addRegion),{segments:e.calculate( [...] - "/*!\n", - " * The buffer module from node.js, for the browser.\n", - " *\n", - " * @author Feross Aboukhadijeh <https://feross.org>\n", - " * @license MIT\n", - " */\n", - "\"use strict\";var e=t(\"base64-js\"),n=t(\"ieee754\");r.Buffer=a,r.SlowBuffer=function(t){+t!=t&&(t=0);return a.alloc(+t)},r.INSPECT_MAX_BYTES=50;function i(t){if(t>2147483647)throw new RangeError('The value \"'+t+'\" is invalid for option \"size\"');var e=new Uint8Array(t);return e.__proto__=a.prototype,e}function a(t,e,r){if(\"number\"==typeof t){if(\"string\"==typeof e)throw new TypeError('The \"string\" argument must be of type string. Received type number');return l(t)}retu [...] - "/*!\n", - " * Determine if an object is a Buffer\n", - " *\n", - " * @author Feross Aboukhadijeh <https://feross.org>\n", - " * @license MIT\n", - " */\n", - "e.exports=function(t){return null!=t&&(n(t)||function(t){return\"function\"==typeof t.readFloatLE&&\"function\"==typeof t.slice&&n(t.slice(0,0))}(t)||!!t._isBuffer)}},{}],238:[function(t,e,r){\"use strict\";e.exports=a,e.exports.isMobile=a,e.exports.default=a;var n=/(android|bb\\d+|meego).+mobile|avantgo|bada\\/|blackberry|blazer|compal|elaine|fennec|hiptop|iemobile|ip(hone|od)|iris|kindle|lge |maemo|midp|mmp|mobile.+firefox|netfront|opera m(ob|in)i|palm( os)?|phone|p(ixi|re)\\/| [...] - "/*!\n", - " * pad-left <https://github.com/jonschlinkert/pad-left>\n", - " *\n", - " * Copyright (c) 2014-2015, Jon Schlinkert.\n", - " * Licensed under the MIT license.\n", - " */\n", - "\"use strict\";var n=t(\"repeat-string\");e.exports=function(t,e,r){return n(r=void 0!==r?r+\"\":\" \",e)+t}},{\"repeat-string\":277}],265:[function(t,e,r){e.exports=function(t,e){e||(e=[0,\"\"]),t=String(t);var r=parseFloat(t,10);return e[0]=r,e[1]=t.match(/[\\d.\\-\\+]*\\s*(.*)/)[1]||\"\",e}},{}],266:[function(t,e,r){\"use strict\";e.exports=function(t,e){for(var r=0|e.length,i=t.length,a=[new Array(r),new Array(r)],o=0;o<r;++o)a[0][o]=[],a[1][o]=[];for(o=0;o<i;++o){var s=t[o]; [...] - "/*!\n", - " * repeat-string <https://github.com/jonschlinkert/repeat-string>\n", - " *\n", - " * Copyright (c) 2014-2015, Jon Schlinkert.\n", - " * Licensed under the MIT License.\n", - " */\n", - "\"use strict\";var n,i=\"\";e.exports=function(t,e){if(\"string\"!=typeof t)throw new TypeError(\"expected a string\");if(1===e)return t;if(2===e)return t+t;var r=t.length*e;if(n!==t||void 0===n)n=t,i=\"\";else if(i.length>=r)return i.substr(0,r);for(;r>i.length&&e>1;)1&e&&(i+=t),e>>=1,t+=t;return i=(i+=t).substr(0,r)}},{}],278:[function(t,e,r){(function(t){(function(){e.exports=t.performance&&t.performance.now?function(){return performance.now()}:Date.now||function(){return+new [...] + "/*! For license information please see plotly.min.js.LICENSE.txt */\n", + "!function(t,e){\"object\"==typeof exports&&\"object\"==typeof module?module.exports=e():\"function\"==typeof define&&define.amd?define([],e):\"object\"==typeof exports?exports.Plotly=e():t.Plotly=e()}(self,(function(){return function(){var t={79288:function(t,e,r){\"use strict\";var n=r(3400),i={\"X,X div\":'direction:ltr;font-family:\"Open Sans\",verdana,arial,sans-serif;margin:0;padding:0;',\"X input,X button\":'font-family:\"Open Sans\",verdana,arial,sans-serif;',\"X input:foc [...] " });\n", " require(['plotly'], function(Plotly) {\n", " window._Plotly = Plotly;\n", @@ -357,7 +300,10 @@ 332, 219 ], - "coloraxis": "coloraxis" + "coloraxis": "coloraxis", + "pattern": { + "shape": "" + } }, "name": "", "offsetgroup": "", @@ -810,11 +756,10 @@ ], "scatter": [ { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } + "fillpattern": { + "fillmode": "overlay", + "size": 10, + "solidity": 0.2 }, "type": "scatter" } @@ -1277,9 +1222,9 @@ } }, "text/html": [ - "<div> <div id=\"4b5d8a28-52c4-47c2-a370-b9329ea22cd9\" class=\"plotly-graph-div\" style=\"height:525px; width:100%;\"></div> <script type=\"text/javascript\"> require([\"plotly\"], function(Plotly) { window.PLOTLYENV=window.PLOTLYENV || {}; if (document.getElementById(\"4b5d8a28-52c4-47c2-a370-b9329ea22cd9\")) { Plotly.newPlot( \"4b5 [...] + "<div> <div id=\"d0a7d06d-82d7-4d2b-ab49-69e9b60999db\" class=\"plotly-graph-div\" style=\"height:525px; width:100%;\"></div> <script type=\"text/javascript\"> require([\"plotly\"], function(Plotly) { window.PLOTLYENV=window.PLOTLYENV || {}; if (document.getElementById(\"d0a7d06d-82d7-4d2b-ab49-69e9b60999db\")) { Plotly.newPlot( \"d0a [...] " \n", - "var gd = document.getElementById('4b5d8a28-52c4-47c2-a370-b9329ea22cd9');\n", + "var gd = document.getElementById('d0a7d06d-82d7-4d2b-ab49-69e9b60999db');\n", "var x = new MutationObserver(function (mutations, observer) {{\n", " var display = window.getComputedStyle(gd).display;\n", " if (!display || display === 'none') {{\n", @@ -1330,7 +1275,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "id": "6077e149", "metadata": {}, "outputs": [], @@ -1350,7 +1295,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "id": "72ae06d7", "metadata": {}, "outputs": [], @@ -1376,7 +1321,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "id": "07aeb50e", "metadata": {}, "outputs": [], @@ -1397,7 +1342,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 11, "id": "4d876bba", "metadata": {}, "outputs": [ @@ -1575,7 +1520,10 @@ 2, 17 ], - "coloraxis": "coloraxis" + "coloraxis": "coloraxis", + "pattern": { + "shape": "" + } }, "name": "", "offsetgroup": "", @@ -2178,11 +2126,10 @@ ], "scatter": [ { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } + "fillpattern": { + "fillmode": "overlay", + "size": 10, + "solidity": 0.2 }, "type": "scatter" } @@ -2645,9 +2592,9 @@ } }, "text/html": [ - "<div> <div id=\"c6e15b56-d294-42f4-9ed5-414fbaf670dd\" class=\"plotly-graph-div\" style=\"height:525px; width:100%;\"></div> <script type=\"text/javascript\"> require([\"plotly\"], function(Plotly) { window.PLOTLYENV=window.PLOTLYENV || {}; if (document.getElementById(\"c6e15b56-d294-42f4-9ed5-414fbaf670dd\")) { Plotly.newPlot( \"c6e [...] + "<div> <div id=\"1b565e2f-938f-47fd-85c9-1a06be8ba205\" class=\"plotly-graph-div\" style=\"height:525px; width:100%;\"></div> <script type=\"text/javascript\"> require([\"plotly\"], function(Plotly) { window.PLOTLYENV=window.PLOTLYENV || {}; if (document.getElementById(\"1b565e2f-938f-47fd-85c9-1a06be8ba205\")) { Plotly.newPlot( \"1b5 [...] " \n", - "var gd = document.getElementById('c6e15b56-d294-42f4-9ed5-414fbaf670dd');\n", + "var gd = document.getElementById('1b565e2f-938f-47fd-85c9-1a06be8ba205');\n", "var x = new MutationObserver(function (mutations, observer) {{\n", " var display = window.getComputedStyle(gd).display;\n", " if (!display || display === 'none') {{\n", @@ -2691,7 +2638,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "id": "b34ee2a8", "metadata": {}, "outputs": [], @@ -2731,15 +2678,15 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 13, "id": "99484625", "metadata": {}, "outputs": [ { "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAb4AAAEuCAYAAADx63eqAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8QVMy6AAAACXBIWXMAAAsTAAALEwEAmpwYAABPQklEQVR4nO3dd3hUZdrH8e+ZXpKQhJbQpPcqSA1SpIiAIoKgdLHAoqzS17Jre9UgVUVFQQVBEBEVQUBUOgHpIh3pJQIhhLTJtPP+MQsLApLMTDKTmftzXbl2SeY8547i/OZ5zlMUVVVVhBBCiDChCXQBQgghREGS4BNCCBFWJPiEEEKEFQk+IYQQYUWCTwghRFiR4BNCCBFWJPiEEEKEFQk+IYQQYUWCTwghRFiR4BNCCBFWJPiEEEKEFQk+IYQQYUWCTwghRFiR4BNCCBFWJPiEEEKEFQk+IYQQYUWCTwghRFiR [...] + "image/png": "iVBORw0KGgoAAAANSUhEUgAAApQAAAHzCAYAAACe1o1DAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuNSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/xnp5ZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAB0dklEQVR4nO3ddVxV9/8H8Nc55166BEQBFVusWYBOnR2bib3NOTtmz4Url66c0zlnzs6vsRmb3R2ozBZ1KCogFoL0jfP74/5gIihxz+Fe4PXcgwd643Pe1yn3dT8pyLIsg4iIiIgon0RLF0BEREREhRsDJRERERGZhYGSiIiIiMzCQElEREREZmGgJCIiIiKzMFASERERkVkYKImIiIjILAyURERERGQWBkoiIiIiMgsDJRERERGZhYGSiIiIiMzCQElEREREZmGgJCIiIiKzMFASERERkVkYKImIiIjILAyURERE [...] "text/plain": [ - "<Figure size 432x288 with 1 Axes>" + "<Figure size 640x480 with 1 Axes>" ] }, "metadata": {}, @@ -2760,15 +2707,15 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 14, "id": "6145530b", "metadata": {}, "outputs": [ { "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAb4AAAEuCAYAAADx63eqAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8QVMy6AAAACXBIWXMAAAsTAAALEwEAmpwYAAA9N0lEQVR4nO3dd3RU1eI98H2nz6RSAyGEXqSXUENHehNQDFVQUASkKBJB9KHP8qRI9SGgCIQqKoYOUqVJkR4QpDwgtAQipE+9vz/mS36Empm5kztlf9ZiJSRzD3twmc0t5xxBFEURREREfkIhdwAiIqL8xOIjIiK/wuIjIiK/wuIjIiK/wuIjIiK/wuIjIiK/wuIjIiK/wuIjIiK/wuIjIiK/wuIjIiK/wuIjIiK/wuIjIiK/wuIjIiK/wuIjIiK/wuIjIiK/wuIjIiK/wuIjIiK/wuIjIiK/wuIjIiK/wuIjIiK/ [...] + "image/png": "iVBORw0KGgoAAAANSUhEUgAAApQAAAHzCAYAAACe1o1DAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuNSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/xnp5ZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABwsUlEQVR4nO3dd3xN9/8H8Ne592aIGLFFjJixi0RsEmKvqqBUqV2zWmq2tTqstmpUzaI2NUtLgtgSkZaGWDEjVmSIzHvv+f1xf7lfqYTEPSfnjtfz+8hD5N58Pu/rq/G6nymIoiiCiIiIiOgtqZQugIiIiIgsGwMlEREREZmEgZKIiIiITMJASUREREQmYaAkIiIiIpMwUBIRERGRSRgoiYiIiMgkDJREREREZBIGSiIiIiIyCQMlEREREZmEgZKIiIiITMJASUREREQmYaAkIiIiIpMwUBIRERGRSRgoiYiIiMgk [...] "text/plain": [ - "<Figure size 432x288 with 1 Axes>" + "<Figure size 640x480 with 1 Axes>" ] }, "metadata": {}, @@ -2789,15 +2736,15 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 15, "id": "2caaec99", "metadata": {}, "outputs": [ { "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAb4AAAEuCAYAAADx63eqAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8QVMy6AAAACXBIWXMAAAsTAAALEwEAmpwYAAA8nElEQVR4nO3deZyN9f//8cdZ55w5M2OQJduESIgs2cmWqERFi8oyEsmWvUWLNlukQkW2CKkUWUKyZBdJ9n1fxpj9zJz1+v1xfnzzYZg5y1znzHndb7dumplzved5Cs95X8v7rVEURUEIIYQIE1q1AwghhBB5SYpPCCFEWJHiE0IIEVak+IQQQoQVKT4hhBBhRYpPCCFEWJHiE0IIEVak+IQQQoQVKT4hhBBhRYpPCCFEWJHiE0IIEVak+IQQQoQVKT4hhBBhRYpPCCFEWJHiE0IIEVak+IQQQoQVKT4hhBBhRYpP [...] + "image/png": "iVBORw0KGgoAAAANSUhEUgAAApQAAAHzCAYAAACe1o1DAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuNSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/xnp5ZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABf/ElEQVR4nO3dd3xN9/8H8Nc5595sQiJBUHsTJFaNElWjVFWXWZJSo4rqFzW6tOhWo2KE2K1WUW2p1agtaWKG2jNGRGTIvvee7x+XVJooyT0n547X8/f4Pb6Ve/M+76DNK58pyLIsg4iIiIioiEStGyAiIiIi28ZASUREREQWYaAkIiIiIoswUBIRERGRRRgoiYiIiMgiDJREREREZBEGSiIiIiKyCAMlEREREVmEgZKIiIiILMJASUREREQWYaAkIiIiIoswUBIRERGRRRgoiYiIiMgiDJREREREZBEGSiIiIiKy [...] "text/plain": [ - "<Figure size 432x288 with 1 Axes>" + "<Figure size 640x480 with 1 Axes>" ] }, "metadata": {}, @@ -2818,15 +2765,15 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 16, "id": "3b614621", "metadata": {}, "outputs": [ { "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAb4AAAEuCAYAAADx63eqAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8QVMy6AAAACXBIWXMAAAsTAAALEwEAmpwYAAAzYElEQVR4nO3deVxU5eI/8M/szLAqLoS44Za5oAiCgrm0mUuaplmWS3W/WmZut7Rs8aZWmtdM65bdbPFmLoVpi7tiingBhdxRERJFMBXZZ5/z+4NfXM0NZs7MmeXzfr16FTjnmc9ULz48z3nOOTJBEAQQERH5CLnUAYiIiFyJxUdERD6FxUdERD6FxUdERD6FxUdERD6FxUdERD6FxUdERD6FxUdERD6FxUdERD6FxUdERD6FxUdERD6FxUdERD6FxUdERD6FxUdERD6FxUdERD6FxUdERD6FxUdERD6FxUdERD6F [...] + "image/png": "iVBORw0KGgoAAAANSUhEUgAAApQAAAHzCAYAAACe1o1DAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuNSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/xnp5ZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABY10lEQVR4nO3dd3wUVdvG8WtmN4XQQgu910jvRUREQAEfC4gKKhZEAQGpdlHErqAi1YIICCgoSJOi8KCCtNB7kRp6hwRSduf9Iy95QAIk2Zlsyu/rhw+wm73PvQHJlTNzzjEsy7IEAAAApJLp7wYAAACQsREoAQAA4BMCJQAAAHxCoAQAAIBPCJQAAADwCYESAAAAPiFQAgAAwCcESgAAAPiEQAkAAACfECgBAADgEwIlAAAAfEKgBAAAgE8IlAAAAPAJgRIAAAA+IVACAADAJwRKAAAA+IRACQAAAJ8QKAEAAOAT [...] "text/plain": [ - "<Figure size 432x288 with 1 Axes>" + "<Figure size 640x480 with 1 Axes>" ] }, "metadata": {}, @@ -2864,7 +2811,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 17, "id": "fc862bb2", "metadata": {}, "outputs": [ @@ -2874,7 +2821,7 @@ "0.0" ] }, - "execution_count": 16, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -2895,17 +2842,17 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 18, "id": "16b277a9", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "32.0" + "16.0" ] }, - "execution_count": 17, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -2916,17 +2863,17 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 19, "id": "fcdd9b0d", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "34.0" + "18.0" ] }, - "execution_count": 18, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -2937,17 +2884,17 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 20, "id": "740addd5", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "38.0" + "22.0" ] }, - "execution_count": 19, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -2958,7 +2905,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 21, "id": "e090592f", "metadata": {}, "outputs": [ @@ -2968,7 +2915,7 @@ "2.0" ] }, - "execution_count": 20, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } @@ -2979,17 +2926,17 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 22, "id": "16ea3d74", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "18.0" + "8.0" ] }, - "execution_count": 21, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -3000,7 +2947,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 23, "id": "2876323b", "metadata": {}, "outputs": [ @@ -3010,7 +2957,7 @@ "4.0" ] }, - "execution_count": 22, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } @@ -3029,7 +2976,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 24, "id": "af75336a", "metadata": {}, "outputs": [ @@ -3039,7 +2986,7 @@ "21.333333333333332" ] }, - "execution_count": 23, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } @@ -3055,13 +3002,96 @@ "source": [ "This shows that the average of the edit distances between each of these session DiGraphs is 21.33." ] + }, + { + "cell_type": "markdown", + "id": "9f7e93b2", + "metadata": {}, + "source": [ + "## Feature Definition\n", + "A primitive that allows users to specify a rule or set of rules and an associated label and then use this to add labels to logs." + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "6097bf2b", + "metadata": {}, + "outputs": [], + "source": [ + "from typing import Any, Dict, List\n", + "import pandas as pd\n", + "\n", + "class FeatureDefinition:\n", + " # Implement class logic\n", + " # TODO: Add a very specific type hint to the rule object:\n", + " # see: https://docs.python.org/3/library/typing.html#annotating-callable-objects\n", + " def __init__(self, label: str, rule: Dict[str, Any]):\n", + " if not callable(rule):\n", + " raise TypeError(\"Rule not callable\")\n", + "\n", + " self.label = label\n", + " self.__rule = rule\n", + "\n", + " # This is a wrapper method around the private rule attribute we\n", + " # store on self during init.\n", + " #\n", + " # Q: Why make the rule private and wrap the call to it in another method?\n", + " # A: This encapsulation allows us to expose a nicer set of behavior\n", + " # and naming conventions to both the user and ourselves as developers.\n", + " # In `label_features` below, you see that we can then check whether\n", + " # a log `matches` the definition which reads more like plain english\n", + " # and is an important part of writing clean, idiomatic python code.\n", + " # TODO: Implement this wrapper function by using the _rule attribute\n", + " def matches(self, log: Dict[str, Any]) -> bool:\n", + " if log == self.__rule:\n", + " return log\n", + "\n", + "\n", + "def label_features(\n", + " logs: List[Dict[str, Any]], definitions: List[FeatureDefinition]\n", + ") -> List[Dict[str, Any]]:\n", + " # Iterate through all the logs\n", + " for log in logs:\n", + " for definition in definitions:\n", + " if definition.matches(log):\n", + " if \"labels\" not in log:\n", + " log.update({\"labels\": list()})\n", + " log[\"labels\"].append(definition.label)\n", + " return logs\n", + "\n", + "\n", + "\n", + "if __name__ == \"__main__\":\n", + " json_file = input(\"Enter your json file path: \")\n", + " logs = pd.read_json(json_file)\n", + "\n", + " # Create a map rule to test out the FeatureDefinition with\n", + " def map_rule(log: Dict[str, Any]) -> bool:\n", + " return \"pageUrl\" in log and \"map\" in log[\"pageUrl\"]\n", + " \n", + " def container_rule(log: Dict[str, Any]) -> bool:\n", + " return \"path\" in log and \"container\" in log[\"path\"]\n", + " \n", + " def table_rule(log: Dict[str, Any]) -> bool:\n", + " return \"path\" in log and \"table\" in log[\"path\"]\n", + "\n", + "\n", + " map_page_definition = FeatureDefinition(rule=map_rule, label=\"map_page\")\n", + " container_path_definition = FeatureDefinition(rule=container_rule, label=\"container_path\")\n", + " table_path_definition = FeatureDefinition(rule=table_rule, label=\"table_path\")\n", + "\n", + " label_features(logs=logs, definitions=[map_page_definition])\n", + " label_features(logs=logs, definitions=[container_path_definition])\n", + " label_features(logs=logs, definitions=[table_path_definition])\n" + ] } ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "distill", "language": "python", - "name": "python3" + "name": "distill" }, "language_info": { "codemirror_mode": { @@ -3073,7 +3103,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.4" + "version": "3.10.12" } }, "nbformat": 4, diff --git a/examples/labels.py b/examples/labels.py new file mode 100644 index 0000000..9b55efb --- /dev/null +++ b/examples/labels.py @@ -0,0 +1,91 @@ +import os + +import distill +from tests import testing_utils +from tests.data_config import DATA_DIR +from typing import Any, Dict, List, Callable +import json + + + +class FeatureDefinition: + # Implement class logic + def __init__(self, label: str, rule: Callable[[Dict[str, Any]], bool]): + # Immediately validate the rule, so you can error + # out/exit early if it's invalid + if not callable(rule): + raise TypeError("Rule not callable") + + if not isinstance(label, str): + raise TypeError("Label is not a string") + + self.label = label + self._rule = rule + + # This is a wrapper method around the private rule attribute we + # store on self during init. + # + # Q: Why make the rule private and + # wrap the call to it in another method? + # A: This encapsulation allows us to expose a nicer set of behavior + # and naming conventions to both the user and ourselves as developers. + # In `label_features` below, you see that we can then check whether + # a log `matches` the definition which reads more like plain english + # and is an important part of writing clean, idiomatic python code. + def matches(self, log: Dict[str, Any]) -> bool: + return self._rule(log) + + +def label_features( + logs: List[Dict[str, Any]], definitions: List[FeatureDefinition] +) -> List[Dict[str, Any]]: + # Iterate through all the logs + for log in logs: + # Check whether the log matches the definition + # for each definition supplied in the defintions list + for definition in definitions: + # NOTE: This reads much like an English sentence + # and is self-explanatory. I don't need to read the + # implementation logic to get a sense of what's happening + if definition.matches(log): + # NOTE: Since we're mutating the log itself and interacting + # with a field that may (does) not already exists, we need + # to first check if it is present in our log and instantiate + # it if not. + if "labels" not in log: + log.update({"labels": list()}) + log["labels"].append(definition.label) + return logs + + +########################################################### +# Example of how the FeatureDefintion class works +# +# The following if __name__ == "__main__" syntax +# is a way to tell python that if your run this file +# as a script from the command line, then this is the code +# that needs to be executed. +########################################################### +if __name__ == "__main__": + file = open(os.path.join(DATA_DIR, "sample_data.json"), "r") + logs = json.load(file) + + # Rule to test out the FeatureDefinition with + def type_rule(log): + return "type" in log and "scroll" in log["type"] + + # Definitions to test out the FeatureDefinition with + type_rule_definition = FeatureDefinition(rule=type_rule, label="scroll_type") + rule_not_callable_definintion = FeatureDefinition(rule="rule", label="scroll_type") + string_error_definition = FeatureDefinition(rule=type_rule, label= 10) + + # Call label feature function to test the 3 definitions + label_features(logs=logs, definitions=[type_rule_definition]) + label_features(logs=logs, definitions=[rule_not_callable_definintion]) + label_features(logs=logs, definitions=[string_error_definition]) + + + + + + diff --git a/tests/test_transform.py b/tests/test_transform.py index c49c282..ce9a713 100644 --- a/tests/test_transform.py +++ b/tests/test_transform.py @@ -14,7 +14,17 @@ # See the License for the specific language governing permissions and # limitations under the License. +import os + import distill +from tests import testing_utils +from tests.data_config import DATA_DIR +import pytest +import json +from distill.core.feature_definition import FeatureDefinition +from typing import Any, Dict, List, Callable +from distill.process.transform import label_features + def test_pairwiseStag_1(): @@ -39,3 +49,31 @@ def test_pairwiseSeq_2(): test_list = [1, 2, 3, 4] result = distill.pairwiseSeq(test_list, split=True) assert result == ((1, 2, 3), (2, 3, 4)) + +def test_label_features(): + file = open(os.path.join(DATA_DIR, "sample_data.json"), "r") + logs = json.load(file) + def type_rule(log) -> bool: + return "type" in log and "scroll" in log["type"] + result = label_features(logs,[FeatureDefinition(rule=type_rule, label="scroll_type")]) + assert isinstance(result, list) + assert "labels" in set().union(*result) + assert 'labels', 'scroll_type' in result[1].items() + + +def test_feature_definition_does_not_accept_non_string_label(): + with pytest.raises(TypeError): + file = open(os.path.join(DATA_DIR, "sample_data.json"), "r") + logs = json.load(file) + def input_rule(log): + return "target" in log and "input" in log["target"] + result = label_features(logs,[FeatureDefinition(rule=input_rule, label=10)]) + +def test_feature_definition_does_not_accept_non_callable_rules(): + with pytest.raises(TypeError): + file = open(os.path.join(DATA_DIR, "sample_data.json"), "r") + logs = json.load(file) + result = label_features(logs,[FeatureDefinition(rule="input_rule", label="input_target")]) + + +