This is an automated email from the ASF dual-hosted git repository.

ejones pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/flagon-distill.git


The following commit(s) were added to refs/heads/master by this push:
     new c127bc9  38 add support to label logs (#44)
c127bc9 is described below

commit c127bc9cb09d32bf67674349089dc797239cae65
Author: Broden222 <154264811+broden...@users.noreply.github.com>
AuthorDate: Mon Jul 15 12:01:01 2024 -0400

    38 add support to label logs (#44)
    
    * draft 38 add support to label logs
    
    * 38 add support label logs changes
    
    * 38 add support label logs changes
    
    * feature_definition, tranform, and test_transform updates
    
    * Should I create the process.py in the distill/process directory. Also, should the label function remain in the transform.py as well?
    
    * updated transform.py test_transform.py, and labels.py
    
    * test_transform.py
    
    * updated test_transform.py
---
 distill/core/feature_definition.py      |  32 ++++
 distill/process/transform.py            |  21 +++
 examples/Distill_Workflow_Example.ipynb | 278 ++++++++++++++++++--------------
 examples/labels.py                      |  91 +++++++++++
 tests/test_transform.py                 |  38 +++++
 5 files changed, 336 insertions(+), 124 deletions(-)

diff --git a/distill/core/feature_definition.py b/distill/core/feature_definition.py
new file mode 100644
index 0000000..7708740
--- /dev/null
+++ b/distill/core/feature_definition.py
@@ -0,0 +1,32 @@
+from typing import Any, Dict, List, Callable
+
+class FeatureDefinition:
+
+    def __init__(self, label: str, rule: Callable[[Dict[str, Any]], bool]):
+        """
+        Allows users to specify a rule or set of rules and an 
+        associated label and then use this to add labels to  
+        logs in Distill
+        
+        param label: a string we want to add to the log if the rule is met
+        param rule: must be a callable function which accepts a UserALE log 
+            as an input, and returns a boolean of whether that rule was met or not
+        """
+        if not callable(rule):
+            raise TypeError("Rule not callable")
+        
+        if not isinstance(label, str):
+            raise TypeError("Label is not a string")
+
+        self.label = label
+        self._rule = rule
+
+    def matches(self, log: Dict[str, Any]) -> bool:
+            """
+            A wrapper method around the private rule attribute we
+            store on self during init
+            """
+            return self._rule(log)
+
+
+
diff --git a/distill/process/transform.py b/distill/process/transform.py
index 86d8011..6788173 100644
--- a/distill/process/transform.py
+++ b/distill/process/transform.py
@@ -15,6 +15,8 @@
 # limitations under the License.
 
 import itertools
+from typing import Any, Dict, List, Callable
+from distill.core.feature_definition import FeatureDefinition
 
 
 def pairwiseStag(iterable, *, split: bool = False):
@@ -52,3 +54,22 @@ def pairwiseSeq(iterable, *, split: bool = False):
         return list1, list2
     else:
         return list(pairs)
+    
+def label_features(
+    logs: List[Dict[str, Any]], definitions: List[FeatureDefinition]
+) -> List[Dict[str, Any]]:
+    """
+    Check whether a log matches the specified a rule or set of rules 
+    and an associated label definition
+
+    param logs: UserALE log
+    definitions: specified rule(s) and label
+    return: logs
+    """
+    for log in logs:
+        for definition in definitions:
+            if definition.matches(log):
+                if "labels" not in log:
+                    log.update({"labels": list()})
+                log["labels"].append(definition.label)
+    return logs
diff --git a/examples/Distill_Workflow_Example.ipynb b/examples/Distill_Workflow_Example.ipynb
index aa99017..12a7309 100644
--- a/examples/Distill_Workflow_Example.ipynb
+++ b/examples/Distill_Workflow_Example.ipynb
@@ -55,7 +55,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 2,
    "id": "9d2e506b",
    "metadata": {},
    "outputs": [],
@@ -100,7 +100,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 3,
    "id": "e9cbd628",
    "metadata": {},
    "outputs": [],
@@ -139,7 +139,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 4,
    "id": "ca532053",
    "metadata": {},
    "outputs": [],
@@ -168,7 +168,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 5,
    "id": "efb27f53",
    "metadata": {},
    "outputs": [
@@ -199,7 +199,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 6,
    "id": "35cd3449",
    "metadata": {},
    "outputs": [],
@@ -243,7 +243,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 7,
    "id": "b6297d7f",
    "metadata": {},
    "outputs": [
@@ -252,75 +252,18 @@
       "text/html": [
        "        <script type=\"text/javascript\">\n",
        "        window.PlotlyConfig = {MathJaxConfig: 'local'};\n",
-       "        if (window.MathJax) {MathJax.Hub.Config({SVG: {font: \"STIX-Web\"}});}\n",
+       "        if (window.MathJax && window.MathJax.Hub && window.MathJax.Hub.Config) {window.MathJax.Hub.Config({SVG: {font: \"STIX-Web\"}});}\n",
        "        if (typeof require !== 'undefined') {\n",
        "        require.undef(\"plotly\");\n",
        "        define('plotly', function(require, exports, module) {\n",
        "            /**\n",
-       "* plotly.js v2.8.3\n",
-       "* Copyright 2012-2021, Plotly, Inc.\n",
+       "* plotly.js v2.32.0\n",
+       "* Copyright 2012-2024, Plotly, Inc.\n",
        "* All rights reserved.\n",
        "* Licensed under the MIT license\n",
        "*/\n",
-       "!function(t){if(\"object\"==typeof exports&&\"undefined\"!=typeof 
module)module.exports=t();else if(\"function\"==typeof 
define&&define.amd)define([],t);else{(\"undefined\"!=typeof 
window?window:\"undefined\"!=typeof global?global:\"undefined\"!=typeof 
self?self:this).Plotly=t()}}((function(){return function t(e,r,n){function 
i(o,s){if(!r[o]){if(!e[o]){var l=\"function\"==typeof 
require&&require;if(!s&&l)return l(o,!0);if(a)return a(o,!0);var c=new 
Error(\"Cannot find module '\"+ [...]
-       "/*!\n",
-       " * The buffer module from node.js, for the browser.\n",
-       " *\n",
-       " * @author   Feross Aboukhadijeh <fer...@feross.org> 
<http://feross.org>\n",
-       " * @license  MIT\n",
-       " */function i(t,e){if(t===e)return 0;for(var 
r=t.length,n=e.length,i=0,a=Math.min(r,n);i<a;++i)if(t[i]!==e[i]){r=t[i],n=e[i];break}return
 r<n?-1:n<r?1:0}function a(t){return r.Buffer&&\"function\"==typeof 
r.Buffer.isBuffer?r.Buffer.isBuffer(t):!(null==t||!t._isBuffer)}var 
o=t(\"util/\"),s=Object.prototype.hasOwnProperty,l=Array.prototype.slice,c=\"foo\"===function(){}.name;function
 u(t){return Object.prototype.toString.call(t)}function 
f(t){return!a(t)&&(\"function\"==typeof r.Ar [...]
-       "/*!\n",
-       " * The buffer module from node.js, for the browser.\n",
-       " *\n",
-       " * @author   Feross Aboukhadijeh <https://feross.org>\n",
-       " * @license  MIT\n",
-       " */\n",
-       "\"use strict\";var 
e=t(\"base64-js\"),n=t(\"ieee754\");r.Buffer=a,r.SlowBuffer=function(t){+t!=t&&(t=0);return
 a.alloc(+t)},r.INSPECT_MAX_BYTES=50;function i(t){if(t>2147483647)throw new 
RangeError('The value \"'+t+'\" is invalid for option \"size\"');var e=new 
Uint8Array(t);return e.__proto__=a.prototype,e}function 
a(t,e,r){if(\"number\"==typeof t){if(\"string\"==typeof e)throw new 
TypeError('The \"string\" argument must be of type string. Received type 
number');return l(t)}retu [...]
-       "/*! Native Promise Only\n",
-       "    v0.8.1 (c) Kyle Simpson\n",
-       "    MIT License: http://getify.mit-license.org\n",
-       "*/\n",
-       "!function(t,r,n){r[t]=r[t]||n(),void 
0!==e&&e.exports&&(e.exports=r[t])}(\"Promise\",void 
0!==t?t:this,(function(){\"use strict\";var 
t,e,n,i=Object.prototype.toString,a=void 0!==r?function(t){return 
r(t)}:setTimeout;try{Object.defineProperty({},\"x\",{}),t=function(t,e,r,n){return
 
Object.defineProperty(t,e,{value:r,writable:!0,configurable:!1!==n})}}catch(e){t=function(t,e,r){return
 t[e]=r,t}}function o(t,r){n.add(t,r),e||(e=a(n.drain))}function s(t){var 
e,r=typeof t;return null [...]
-       "/*\n",
-       "object-assign\n",
-       "(c) Sindre Sorhus\n",
-       "@license MIT\n",
-       "*/\n",
-       "\"use strict\";var 
n=Object.getOwnPropertySymbols,i=Object.prototype.hasOwnProperty,a=Object.prototype.propertyIsEnumerable;function
 o(t){if(null==t)throw new TypeError(\"Object.assign cannot be called with null 
or undefined\");return 
Object(t)}e.exports=function(){try{if(!Object.assign)return!1;var t=new 
String(\"abc\");if(t[5]=\"de\",\"5\"===Object.getOwnPropertyNames(t)[0])return!1;for(var
 
e={},r=0;r<10;r++)e[\"_\"+String.fromCharCode(r)]=r;if(\"0123456789\"!==Object.getOwnPro
 [...]
-       "/*\n",
-       " * @copyright 2016 Sean Connelly (@voidqk), http://syntheti.cc\n",
-       " * @license MIT\n",
-       " * @preserve Project Home: https://github.com/voidqk/polybooljs\n",
-       " */\n",
-       "var 
n,i=t(\"./lib/build-log\"),a=t(\"./lib/epsilon\"),o=t(\"./lib/intersecter\"),s=t(\"./lib/segment-chainer\"),l=t(\"./lib/segment-selector\"),c=t(\"./lib/geojson\"),u=!1,f=a();function
 h(t,e,r){var i=n.segments(t),a=n.segments(e),o=r(n.combine(i,a));return 
n.polygon(o)}n={buildLog:function(t){return!0===t?u=i():!1===t&&(u=!1),!1!==u&&u.list},epsilon:function(t){return
 f.epsilon(t)},segments:function(t){var e=o(!0,f,u);return 
t.regions.forEach(e.addRegion),{segments:e.calculate( [...]
-       "/*!\n",
-       " * The buffer module from node.js, for the browser.\n",
-       " *\n",
-       " * @author   Feross Aboukhadijeh <https://feross.org>\n",
-       " * @license  MIT\n",
-       " */\n",
-       "\"use strict\";var 
e=t(\"base64-js\"),n=t(\"ieee754\");r.Buffer=a,r.SlowBuffer=function(t){+t!=t&&(t=0);return
 a.alloc(+t)},r.INSPECT_MAX_BYTES=50;function i(t){if(t>2147483647)throw new 
RangeError('The value \"'+t+'\" is invalid for option \"size\"');var e=new 
Uint8Array(t);return e.__proto__=a.prototype,e}function 
a(t,e,r){if(\"number\"==typeof t){if(\"string\"==typeof e)throw new 
TypeError('The \"string\" argument must be of type string. Received type 
number');return l(t)}retu [...]
-       "/*!\n",
-       " * Determine if an object is a Buffer\n",
-       " *\n",
-       " * @author   Feross Aboukhadijeh <https://feross.org>\n",
-       " * @license  MIT\n",
-       " */\n",
-       "e.exports=function(t){return 
null!=t&&(n(t)||function(t){return\"function\"==typeof 
t.readFloatLE&&\"function\"==typeof 
t.slice&&n(t.slice(0,0))}(t)||!!t._isBuffer)}},{}],238:[function(t,e,r){\"use 
strict\";e.exports=a,e.exports.isMobile=a,e.exports.default=a;var 
n=/(android|bb\\d+|meego).+mobile|avantgo|bada\\/|blackberry|blazer|compal|elaine|fennec|hiptop|iemobile|ip(hone|od)|iris|kindle|lge
 |maemo|midp|mmp|mobile.+firefox|netfront|opera m(ob|in)i|palm( 
os)?|phone|p(ixi|re)\\/| [...]
-       "/*!\n",
-       " * pad-left <https://github.com/jonschlinkert/pad-left>\n",
-       " *\n",
-       " * Copyright (c) 2014-2015, Jon Schlinkert.\n",
-       " * Licensed under the MIT license.\n",
-       " */\n",
-       "\"use strict\";var 
n=t(\"repeat-string\");e.exports=function(t,e,r){return n(r=void 
0!==r?r+\"\":\" 
\",e)+t}},{\"repeat-string\":277}],265:[function(t,e,r){e.exports=function(t,e){e||(e=[0,\"\"]),t=String(t);var
 r=parseFloat(t,10);return 
e[0]=r,e[1]=t.match(/[\\d.\\-\\+]*\\s*(.*)/)[1]||\"\",e}},{}],266:[function(t,e,r){\"use
 strict\";e.exports=function(t,e){for(var r=0|e.length,i=t.length,a=[new 
Array(r),new Array(r)],o=0;o<r;++o)a[0][o]=[],a[1][o]=[];for(o=0;o<i;++o){var 
s=t[o]; [...]
-       "/*!\n",
-       " * repeat-string <https://github.com/jonschlinkert/repeat-string>\n",
-       " *\n",
-       " * Copyright (c) 2014-2015, Jon Schlinkert.\n",
-       " * Licensed under the MIT License.\n",
-       " */\n",
-       "\"use strict\";var 
n,i=\"\";e.exports=function(t,e){if(\"string\"!=typeof t)throw new 
TypeError(\"expected a string\");if(1===e)return t;if(2===e)return t+t;var 
r=t.length*e;if(n!==t||void 0===n)n=t,i=\"\";else if(i.length>=r)return 
i.substr(0,r);for(;r>i.length&&e>1;)1&e&&(i+=t),e>>=1,t+=t;return 
i=(i+=t).substr(0,r)}},{}],278:[function(t,e,r){(function(t){(function(){e.exports=t.performance&&t.performance.now?function(){return
 performance.now()}:Date.now||function(){return+new  [...]
+       "/*! For license information please see plotly.min.js.LICENSE.txt */\n",
+       "!function(t,e){\"object\"==typeof exports&&\"object\"==typeof 
module?module.exports=e():\"function\"==typeof 
define&&define.amd?define([],e):\"object\"==typeof 
exports?exports.Plotly=e():t.Plotly=e()}(self,(function(){return function(){var 
t={79288:function(t,e,r){\"use strict\";var n=r(3400),i={\"X,X 
div\":'direction:ltr;font-family:\"Open 
Sans\",verdana,arial,sans-serif;margin:0;padding:0;',\"X input,X 
button\":'font-family:\"Open Sans\",verdana,arial,sans-serif;',\"X input:foc 
[...]
        "        });\n",
        "        require(['plotly'], function(Plotly) {\n",
        "            window._Plotly = Plotly;\n",
@@ -357,7 +300,10 @@
            332,
            219
           ],
-          "coloraxis": "coloraxis"
+          "coloraxis": "coloraxis",
+          "pattern": {
+           "shape": ""
+          }
          },
          "name": "",
          "offsetgroup": "",
@@ -810,11 +756,10 @@
           ],
           "scatter": [
            {
-            "marker": {
-             "colorbar": {
-              "outlinewidth": 0,
-              "ticks": ""
-             }
+            "fillpattern": {
+             "fillmode": "overlay",
+             "size": 10,
+             "solidity": 0.2
             },
             "type": "scatter"
            }
@@ -1277,9 +1222,9 @@
        }
       },
       "text/html": [
-       "<div>                            <div 
id=\"4b5d8a28-52c4-47c2-a370-b9329ea22cd9\" class=\"plotly-graph-div\" 
style=\"height:525px; width:100%;\"></div>            <script 
type=\"text/javascript\">                require([\"plotly\"], function(Plotly) 
{                    window.PLOTLYENV=window.PLOTLYENV || {};                   
                 if 
(document.getElementById(\"4b5d8a28-52c4-47c2-a370-b9329ea22cd9\")) {           
         Plotly.newPlot(                        \"4b5 [...]
+       "<div>                            <div 
id=\"d0a7d06d-82d7-4d2b-ab49-69e9b60999db\" class=\"plotly-graph-div\" 
style=\"height:525px; width:100%;\"></div>            <script 
type=\"text/javascript\">                require([\"plotly\"], function(Plotly) 
{                    window.PLOTLYENV=window.PLOTLYENV || {};                   
                 if 
(document.getElementById(\"d0a7d06d-82d7-4d2b-ab49-69e9b60999db\")) {           
         Plotly.newPlot(                        \"d0a [...]
        "                            \n",
-       "var gd = 
document.getElementById('4b5d8a28-52c4-47c2-a370-b9329ea22cd9');\n",
+       "var gd = 
document.getElementById('d0a7d06d-82d7-4d2b-ab49-69e9b60999db');\n",
        "var x = new MutationObserver(function (mutations, observer) {{\n",
        "        var display = window.getComputedStyle(gd).display;\n",
        "        if (!display || display === 'none') {{\n",
@@ -1330,7 +1275,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 8,
    "id": "6077e149",
    "metadata": {},
    "outputs": [],
@@ -1350,7 +1295,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 9,
    "id": "72ae06d7",
    "metadata": {},
    "outputs": [],
@@ -1376,7 +1321,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 10,
    "id": "07aeb50e",
    "metadata": {},
    "outputs": [],
@@ -1397,7 +1342,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 11,
    "id": "4d876bba",
    "metadata": {},
    "outputs": [
@@ -1575,7 +1520,10 @@
            2,
            17
           ],
-          "coloraxis": "coloraxis"
+          "coloraxis": "coloraxis",
+          "pattern": {
+           "shape": ""
+          }
          },
          "name": "",
          "offsetgroup": "",
@@ -2178,11 +2126,10 @@
           ],
           "scatter": [
            {
-            "marker": {
-             "colorbar": {
-              "outlinewidth": 0,
-              "ticks": ""
-             }
+            "fillpattern": {
+             "fillmode": "overlay",
+             "size": 10,
+             "solidity": 0.2
             },
             "type": "scatter"
            }
@@ -2645,9 +2592,9 @@
        }
       },
       "text/html": [
-       "<div>                            <div 
id=\"c6e15b56-d294-42f4-9ed5-414fbaf670dd\" class=\"plotly-graph-div\" 
style=\"height:525px; width:100%;\"></div>            <script 
type=\"text/javascript\">                require([\"plotly\"], function(Plotly) 
{                    window.PLOTLYENV=window.PLOTLYENV || {};                   
                 if 
(document.getElementById(\"c6e15b56-d294-42f4-9ed5-414fbaf670dd\")) {           
         Plotly.newPlot(                        \"c6e [...]
+       "<div>                            <div 
id=\"1b565e2f-938f-47fd-85c9-1a06be8ba205\" class=\"plotly-graph-div\" 
style=\"height:525px; width:100%;\"></div>            <script 
type=\"text/javascript\">                require([\"plotly\"], function(Plotly) 
{                    window.PLOTLYENV=window.PLOTLYENV || {};                   
                 if 
(document.getElementById(\"1b565e2f-938f-47fd-85c9-1a06be8ba205\")) {           
         Plotly.newPlot(                        \"1b5 [...]
        "                            \n",
-       "var gd = 
document.getElementById('c6e15b56-d294-42f4-9ed5-414fbaf670dd');\n",
+       "var gd = 
document.getElementById('1b565e2f-938f-47fd-85c9-1a06be8ba205');\n",
        "var x = new MutationObserver(function (mutations, observer) {{\n",
        "        var display = window.getComputedStyle(gd).display;\n",
        "        if (!display || display === 'none') {{\n",
@@ -2691,7 +2638,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 12,
    "id": "b34ee2a8",
    "metadata": {},
    "outputs": [],
@@ -2731,15 +2678,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 13,
    "id": "99484625",
    "metadata": {},
    "outputs": [
     {
      "data": {
-      "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAb4AAAEuCAYAAADx63eqAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8QVMy6AAAACXBIWXMAAAsTAAALEwEAmpwYAABPQklEQVR4nO3dd3hUZdrH8e+ZXpKQhJbQpPcqSA1SpIiAIoKgdLHAoqzS17Jre9UgVUVFQQVBEBEVQUBUOgHpIh3pJQIhhLTJtPP+MQsLApLMTDKTmftzXbl2SeY8547i/OZ5zlMUVVVVhBBCiDChCXQBQgghREGS4BNCCBFWJPiEEEKEFQk+IYQQYUWCTwghRFiR4BNCCBFWJPiEEEKEFQk+IYQQYUWCTwghRFiR4BNCCBFWJPiEEEKEFQk+IYQQYUWCTwghRFiR4BNCCBFWJPiEEEKEFQk+IYQQYUWCTwghRFiR
 [...]
+      "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAApQAAAHzCAYAAACe1o1DAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuNSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/xnp5ZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAB0dklEQVR4nO3ddVxV9/8H8Nc55166BEQBFVusWYBOnR2bib3NOTtmz4Url66c0zlnzs6vsRmb3R2ozBZ1KCogFoL0jfP74/5gIihxz+Fe4PXcgwd643Pe1yn3dT8pyLIsg4iIiIgon0RLF0BEREREhRsDJRERERGZhYGSiIiIiMzCQElEREREZmGgJCIiIiKzMFASERERkVkYKImIiIjILAyURERERGQWBkoiIiIiMgsDJRERERGZhYGSiIiIiMzCQElEREREZmGgJCIiIiKzMFASERERkVkYKImIiIjILAyURERE
 [...]
       "text/plain": [
-       "<Figure size 432x288 with 1 Axes>"
+       "<Figure size 640x480 with 1 Axes>"
       ]
      },
      "metadata": {},
@@ -2760,15 +2707,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 14,
    "id": "6145530b",
    "metadata": {},
    "outputs": [
     {
      "data": {
-      "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAb4AAAEuCAYAAADx63eqAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8QVMy6AAAACXBIWXMAAAsTAAALEwEAmpwYAAA9N0lEQVR4nO3dd3RU1eI98H2nz6RSAyGEXqSXUENHehNQDFVQUASkKBJB9KHP8qRI9SGgCIQqKoYOUqVJkR4QpDwgtAQipE+9vz/mS36Empm5kztlf9ZiJSRzD3twmc0t5xxBFEURREREfkIhdwAiIqL8xOIjIiK/wuIjIiK/wuIjIiK/wuIjIiK/wuIjIiK/wuIjIiK/wuIjIiK/wuIjIiK/wuIjIiK/wuIjIiK/wuIjIiK/wuIjIiK/wuIjIiK/wuIjIiK/wuIjIiK/wuIjIiK/wuIjIiK/wuIjIiK/wuIjIiK/
 [...]
+      "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAApQAAAHzCAYAAACe1o1DAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuNSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/xnp5ZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABwsUlEQVR4nO3dd3xN9/8H8Ne592aIGLFFjJixi0RsEmKvqqBUqV2zWmq2tTqstmpUzaI2NUtLgtgSkZaGWDEjVmSIzHvv+f1xf7lfqYTEPSfnjtfz+8hD5N58Pu/rq/G6nymIoiiCiIiIiOgtqZQugIiIiIgsGwMlEREREZmEgZKIiIiITMJASUREREQmYaAkIiIiIpMwUBIRERGRSRgoiYiIiMgkDJREREREZBIGSiIiIiIyCQMlEREREZmEgZKIiIiITMJASUREREQmYaAkIiIiIpMwUBIRERGRSRgoiYiIiMgk
 [...]
       "text/plain": [
-       "<Figure size 432x288 with 1 Axes>"
+       "<Figure size 640x480 with 1 Axes>"
       ]
      },
      "metadata": {},
@@ -2789,15 +2736,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 15,
    "id": "2caaec99",
    "metadata": {},
    "outputs": [
     {
      "data": {
-      "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAb4AAAEuCAYAAADx63eqAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8QVMy6AAAACXBIWXMAAAsTAAALEwEAmpwYAAA8nElEQVR4nO3deZyN9f//8cdZ55w5M2OQJduESIgs2cmWqERFi8oyEsmWvUWLNlukQkW2CKkUWUKyZBdJ9n1fxpj9zJz1+v1xfnzzYZg5y1znzHndb7dumplzved5Cs95X8v7rVEURUEIIYQIE1q1AwghhBB5SYpPCCFEWJHiE0IIEVak+IQQQoQVKT4hhBBhRYpPCCFEWJHiE0IIEVak+IQQQoQVKT4hhBBhRYpPCCFEWJHiE0IIEVak+IQQQoQVKT4hhBBhRYpPCCFEWJHiE0IIEVak+IQQQoQVKT4hhBBhRYpP
 [...]
+      "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAApQAAAHzCAYAAACe1o1DAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuNSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/xnp5ZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABf/ElEQVR4nO3dd3xN9/8H8Nc5595sQiJBUHsTJFaNElWjVFWXWZJSo4rqFzW6tOhWo2KE2K1WUW2p1agtaWKG2jNGRGTIvvee7x+XVJooyT0n547X8/f4Pb6Ve/M+76DNK58pyLIsg4iIiIioiEStGyAiIiIi28ZASUREREQWYaAkIiIiIoswUBIRERGRRRgoiYiIiMgiDJREREREZBEGSiIiIiKyCAMlEREREVmEgZKIiIiILMJASUREREQWYaAkIiIiIoswUBIRERGRRRgoiYiIiMgiDJREREREZBEGSiIiIiKy
 [...]
       "text/plain": [
-       "<Figure size 432x288 with 1 Axes>"
+       "<Figure size 640x480 with 1 Axes>"
       ]
      },
      "metadata": {},
@@ -2818,15 +2765,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 16,
    "id": "3b614621",
    "metadata": {},
    "outputs": [
     {
      "data": {
-      "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAb4AAAEuCAYAAADx63eqAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8QVMy6AAAACXBIWXMAAAsTAAALEwEAmpwYAAAzYElEQVR4nO3deVxU5eI/8M/szLAqLoS44Za5oAiCgrm0mUuaplmWS3W/WmZut7Rs8aZWmtdM65bdbPFmLoVpi7tiingBhdxRERJFMBXZZ5/z+4NfXM0NZs7MmeXzfr16FTjnmc9ULz48z3nOOTJBEAQQERH5CLnUAYiIiFyJxUdERD6FxUdERD6FxUdERD6FxUdERD6FxUdERD6FxUdERD6FxUdERD6FxUdERD6FxUdERD6FxUdERD6FxUdERD6FxUdERD6FxUdERD6FxUdERD6FxUdERD6FxUdERD6FxUdERD6F
 [...]
+      "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAApQAAAHzCAYAAACe1o1DAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuNSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/xnp5ZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABY10lEQVR4nO3dd3wUVdvG8WtmN4XQQgu910jvRUREQAEfC4gKKhZEAQGpdlHErqAi1YIICCgoSJOi8KCCtNB7kRp6hwRSduf9Iy95QAIk2Zlsyu/rhw+wm73PvQHJlTNzzjEsy7IEAAAApJLp7wYAAACQsREoAQAA4BMCJQAAAHxCoAQAAIBPCJQAAADwCYESAAAAPiFQAgAAwCcESgAAAPiEQAkAAACfECgBAADgEwIlAAAAfEKgBAAAgE8IlAAAAPAJgRIAAAA+IVACAADAJwRKAAAA+IRACQAAAJ8QKAEAAOAT
 [...]
       "text/plain": [
-       "<Figure size 432x288 with 1 Axes>"
+       "<Figure size 640x480 with 1 Axes>"
       ]
      },
      "metadata": {},
@@ -2864,7 +2811,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 17,
    "id": "fc862bb2",
    "metadata": {},
    "outputs": [
@@ -2874,7 +2821,7 @@
        "0.0"
       ]
      },
-     "execution_count": 16,
+     "execution_count": 17,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -2895,17 +2842,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 18,
    "id": "16b277a9",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "32.0"
+       "16.0"
       ]
      },
-     "execution_count": 17,
+     "execution_count": 18,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -2916,17 +2863,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 19,
    "id": "fcdd9b0d",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "34.0"
+       "18.0"
       ]
      },
-     "execution_count": 18,
+     "execution_count": 19,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -2937,17 +2884,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": 20,
    "id": "740addd5",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "38.0"
+       "22.0"
       ]
      },
-     "execution_count": 19,
+     "execution_count": 20,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -2958,7 +2905,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 20,
+   "execution_count": 21,
    "id": "e090592f",
    "metadata": {},
    "outputs": [
@@ -2968,7 +2915,7 @@
        "2.0"
       ]
      },
-     "execution_count": 20,
+     "execution_count": 21,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -2979,17 +2926,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 21,
+   "execution_count": 22,
    "id": "16ea3d74",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "18.0"
+       "8.0"
       ]
      },
-     "execution_count": 21,
+     "execution_count": 22,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -3000,7 +2947,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": 23,
    "id": "2876323b",
    "metadata": {},
    "outputs": [
@@ -3010,7 +2957,7 @@
        "4.0"
       ]
      },
-     "execution_count": 22,
+     "execution_count": 23,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -3029,7 +2976,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 23,
+   "execution_count": 24,
    "id": "af75336a",
    "metadata": {},
    "outputs": [
@@ -3039,7 +2986,7 @@
        "21.333333333333332"
       ]
      },
-     "execution_count": 23,
+     "execution_count": 24,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -3055,13 +3002,96 @@
    "source": [
     "This shows that the average of the edit distances between each of these session DiGraphs is 21.33."
    ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "9f7e93b2",
+   "metadata": {},
+   "source": [
+    "## Feature Definition\n",
+    "A primitive that allows users to specify a rule or set of rules and an associated label and then use this to add labels to  logs."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 38,
+   "id": "6097bf2b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from typing import Any, Dict, List\n",
+    "import pandas as pd\n",
+    "\n",
+    "class FeatureDefinition:\n",
+    "    # Implement class logic\n",
+    "    # TODO: Add a very specific type hint to the rule object:\n",
+    "    # see: https://docs.python.org/3/library/typing.html#annotating-callable-objects\n",
+    "    def __init__(self, label: str, rule: Dict[str, Any]):\n",
+    "        if not callable(rule):\n",
+    "            raise TypeError(\"Rule not callable\")\n",
+    "\n",
+    "        self.label = label\n",
+    "        self.__rule = rule\n",
+    "\n",
+    "    # This is a wrapper method around the private rule attribute we\n",
+    "    # store on self during init.\n",
+    "    #\n",
+    "    # Q: Why make the rule private and wrap the call to it in another method?\n",
+    "    # A: This encapsulation allows us to expose a nicer set of behavior\n",
+    "    # and naming conventions to both the user and ourselves as developers.\n",
+    "    # In `label_features` below, you see that we can then check whether\n",
+    "    # a log `matches` the definition which reads more like plain english\n",
+    "    # and is an important part of writing clean, idiomatic python code.\n",
+    "    # TODO: Implement this wrapper function by using the _rule attribute\n",
+    "    def matches(self, log: Dict[str, Any]) -> bool:\n",
+    "        if log == self.__rule:\n",
+    "            return log\n",
+    "\n",
+    "\n",
+    "def label_features(\n",
+    "    logs: List[Dict[str, Any]], definitions: List[FeatureDefinition]\n",
+    ") -> List[Dict[str, Any]]:\n",
+    "    # Iterate through all the logs\n",
+    "    for log in logs:\n",
+    "        for definition in definitions:\n",
+    "            if definition.matches(log):\n",
+    "                if \"labels\" not in log:\n",
+    "                    log.update({\"labels\": list()})\n",
+    "                log[\"labels\"].append(definition.label)\n",
+    "    return logs\n",
+    "\n",
+    "\n",
+    "\n",
+    "if __name__ == \"__main__\":\n",
+    "    json_file = input(\"Enter your json file path: \")\n",
+    "    logs = pd.read_json(json_file)\n",
+    "\n",
+    "    # Create a map rule to test out the FeatureDefinition with\n",
+    "    def map_rule(log: Dict[str, Any]) -> bool:\n",
+    "        return \"pageUrl\" in log and \"map\" in log[\"pageUrl\"]\n",
+    "    \n",
+    "    def container_rule(log: Dict[str, Any]) -> bool:\n",
+    "        return \"path\" in log and \"container\" in log[\"path\"]\n",
+    "    \n",
+    "    def table_rule(log: Dict[str, Any]) -> bool:\n",
+    "        return \"path\" in log and \"table\" in log[\"path\"]\n",
+    "\n",
+    "\n",
+    "    map_page_definition = FeatureDefinition(rule=map_rule, label=\"map_page\")\n",
+    "    container_path_definition = FeatureDefinition(rule=container_rule, label=\"container_path\")\n",
+    "    table_path_definition = FeatureDefinition(rule=table_rule, label=\"table_path\")\n",
+    "\n",
+    "    label_features(logs=logs, definitions=[map_page_definition])\n",
+    "    label_features(logs=logs, definitions=[container_path_definition])\n",
+    "    label_features(logs=logs, definitions=[table_path_definition])\n"
+   ]
   }
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
+   "display_name": "distill",
    "language": "python",
-   "name": "python3"
+   "name": "distill"
   },
   "language_info": {
    "codemirror_mode": {
@@ -3073,7 +3103,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.11.4"
+   "version": "3.10.12"
   }
  },
  "nbformat": 4,
diff --git a/examples/labels.py b/examples/labels.py
new file mode 100644
index 0000000..9b55efb
--- /dev/null
+++ b/examples/labels.py
@@ -0,0 +1,91 @@
+import os
+
+import distill
+from tests import testing_utils
+from tests.data_config import DATA_DIR
+from typing import Any, Dict, List, Callable
+import json
+
+
+
+class FeatureDefinition:
+    # Implement class logic
+    def __init__(self, label: str, rule: Callable[[Dict[str, Any]], bool]):
+        # Immediately validate the rule, so you can error
+        # out/exit early if it's invalid
+        if not callable(rule):
+            raise TypeError("Rule not callable")
+        
+        if not isinstance(label, str):
+            raise TypeError("Label is not a string")
+
+        self.label = label
+        self._rule = rule
+
+    # This is a wrapper method around the private rule attribute we
+    # store on self during init.
+    #
+    # Q: Why make the rule private and
+    # wrap the call to it in another method?
+    # A: This encapsulation allows us to expose a nicer set of behavior
+    # and naming conventions to both the user and ourselves as developers.
+    # In `label_features` below, you see that we can then check whether
+    # a log `matches` the definition, which reads more like plain English
+    # and is an important part of writing clean, idiomatic python code.
+    def matches(self, log: Dict[str, Any]) -> bool:
+        return self._rule(log)
+
+
+def label_features(
+    logs: List[Dict[str, Any]], definitions: List[FeatureDefinition]
+) -> List[Dict[str, Any]]:
+    # Iterate through all the logs
+    for log in logs:
+        # Check whether the log matches the definition
+        # for each definition supplied in the definitions list
+        for definition in definitions:
+            # NOTE: This reads much like an English sentence
+            # and is self-explanatory. I don't need to read the
+            # implementation logic to get a sense of what's happening
+            if definition.matches(log):
+                # NOTE: Since we're mutating the log itself and interacting
+                # with a field that may not (and here does not) already exist, we need
+                # to first check if it is present in our log and instantiate
+                # it if not.
+                if "labels" not in log:
+                    log.update({"labels": list()})
+                log["labels"].append(definition.label)
+    return logs
+
+
+###########################################################
+# Example of how the FeatureDefinition class works
+#
+# The following if __name__ == "__main__" syntax
+# is a way to tell Python that if you run this file
+# as a script from the command line, then this is the code
+# that needs to be executed.
+###########################################################
+if __name__ == "__main__":
+    file = open(os.path.join(DATA_DIR, "sample_data.json"), "r")
+    logs = json.load(file)
+
+    # Rule to test out the FeatureDefinition with
+    def type_rule(log):
+        return "type" in log and "scroll" in log["type"]
+
+    # Definitions to test out the FeatureDefinition with
+    type_rule_definition = FeatureDefinition(rule=type_rule, 
label="scroll_type")
+    rule_not_callable_definintion = FeatureDefinition(rule="rule", 
label="scroll_type")
+    string_error_definition = FeatureDefinition(rule=type_rule, label= 10)
+    
+    # Call label feature function to test the 3 definitions
+    label_features(logs=logs, definitions=[type_rule_definition])
+    label_features(logs=logs, definitions=[rule_not_callable_definintion])
+    label_features(logs=logs, definitions=[string_error_definition])
+
+
+ 
+
+
+
diff --git a/tests/test_transform.py b/tests/test_transform.py
index c49c282..ce9a713 100644
--- a/tests/test_transform.py
+++ b/tests/test_transform.py
@@ -14,7 +14,17 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import os
+
 import distill
+from tests import testing_utils
+from tests.data_config import DATA_DIR
+import pytest
+import json
+from distill.core.feature_definition import FeatureDefinition
+from typing import Any, Dict, List, Callable
+from distill.process.transform import label_features
+
 
 
 def test_pairwiseStag_1():
@@ -39,3 +49,31 @@ def test_pairwiseSeq_2():
     test_list = [1, 2, 3, 4]
     result = distill.pairwiseSeq(test_list, split=True)
     assert result == ((1, 2, 3), (2, 3, 4))
+
+def test_label_features():
+    file = open(os.path.join(DATA_DIR, "sample_data.json"), "r")
+    logs = json.load(file)
+    def type_rule(log) -> bool:
+        return "type" in log and "scroll" in log["type"]
+    result = label_features(logs,[FeatureDefinition(rule=type_rule, 
label="scroll_type")])
+    assert isinstance(result, list)
+    assert "labels" in set().union(*result)
+    assert 'labels', 'scroll_type' in result[1].items()
+
+
+def test_feature_definition_does_not_accept_non_string_label():
+    with pytest.raises(TypeError):
+        file = open(os.path.join(DATA_DIR, "sample_data.json"), "r")
+        logs = json.load(file)
+        def input_rule(log):
+            return "target" in log and "input" in log["target"]
+        result = label_features(logs,[FeatureDefinition(rule=input_rule, 
label=10)])
+
+def test_feature_definition_does_not_accept_non_callable_rules():
+    with pytest.raises(TypeError):
+        file = open(os.path.join(DATA_DIR, "sample_data.json"), "r")
+        logs = json.load(file)
+        result = label_features(logs,[FeatureDefinition(rule="input_rule", 
label="input_target")])
+    
+
+


Reply via email to