This is an automated email from the ASF dual-hosted git repository.

ejones pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/flagon-distill.git


The following commit(s) were added to refs/heads/master by this push:
     new 5048552  48-interval-schema (#50)
5048552 is described below

commit 5048552df32d1177bb4f58864cf6e1431ac1a260
Author: Ryan Thenhaus <73962982+rc10ho...@users.noreply.github.com>
AuthorDate: Mon Jul 15 11:44:59 2024 -0500

    48-interval-schema (#50)
    
    * Added schemas for raw and interval userale logs
    
    * Added polymorphic UserAleSchema and some tests
    
    * Small changes based on review
    
    ---------
    
    Co-authored-by: jlhitzeman <jack.hitze...@gmail.com>
    Co-authored-by: rthenhaus <rthenhaus>
---
 .pre-commit-config.yaml           |  8 ++--
 distill/core/log.py               | 30 +++++++------
 distill/core/types.py             | 25 ++++++-----
 distill/schemas/userale.py        | 89 ++++++++++++++++++++++++++++++---------
 tests/data/log_interval_data.json |  1 +
 tests/data/log_test_data.json     |  2 +-
 tests/test_log.py                 | 30 ++++++++++---
 7 files changed, 129 insertions(+), 56 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 1066640..5b79536 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -30,20 +30,20 @@ repos:
       - id: end-of-file-fixer
       - id: mixed-line-ending
     repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.5.0
+    rev: v4.6.0
   - repo: https://github.com/commitizen-tools/commitizen
-    rev: v3.18.0 # automatically updated by Commitizen
+    rev: v3.27.0 # automatically updated by Commitizen
     hooks:
       - id: commitizen
         stages: [commit-msg]
   - hooks:
       - id: flake8
     repo: https://github.com/pycqa/flake8
-    rev: 7.0.0
+    rev: 7.1.0
   - hooks:
       - id: black
     repo: https://github.com/psf/black
-    rev: 24.2.0
+    rev: 24.4.2
   - hooks:
       - args:
           - --profile
diff --git a/distill/core/log.py b/distill/core/log.py
index 0bfd667..e8c1f7a 100644
--- a/distill/core/log.py
+++ b/distill/core/log.py
@@ -14,17 +14,17 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import json
-
-from pydantic import BaseModel
-from pydantic.type_adapter import TypeAdapter
 from typing import Dict, Union
+
 from pksuid import PKSUID
+from pydantic import BaseModel, parse_obj_as
+from pydantic.type_adapter import TypeAdapter
 
-from distill.core.types import JsonDict, JSONSerializable
-from distill.schemas.userale import UserAleSchema
+from distill.core.types import JsonDict, JSONSerializable, UserAleSchema
 
 ta = TypeAdapter(JsonDict)
 
+
 class Log:
     """
     Base class for log object representation.
@@ -34,27 +34,29 @@ class Log:
                 defaults to UserAle log schema
     """
 
-    def __init__(self, data: Union[str, JsonDict], schema=UserAleSchema):
-        if not issubclass(schema, BaseModel):
-            raise TypeError("schema should inherit from pydantic.BaseModel")
+    def __init__(self, data: Union[str, JsonDict], schema=None):
+        if schema is None:
+            schema = UserAleSchema
+        elif issubclass(schema, BaseModel):
+            raise TypeError(
+                "schema should be a TypeAdapter, not a pydantic.BaseModel subclass"
+            )
 
         if isinstance(data, str):
-            schema.model_validate_json(data, strict=True)
             hash_sfx = str(hash(data))
             data = json.loads(data)
         elif ta.validate_python(data):
             hash_sfx = str(hash(json.dumps(data)))
-            schema.model_validate(data, strict=True)
         else:
-            raise TypeError("ERROR: " + str(type(data)) + " data should be 
either a string or a JsonDict")
-        self.data = schema(**data)
-
-        self.id = PKSUID("log_" + hash_sfx, schema._timestamp(self.data))
+            raise TypeError(
+                "ERROR: "
+                + str(type(data))
+                + " data should be either a string or a JsonDict"
+            )
+        self.data = schema.validate_python(data)
 
+        self.id = PKSUID("log_" + hash_sfx, self.data._timestamp())
 
     def to_json(self) -> str:
         return self.data.model_dump_json(by_alias=True)
 
     def to_dict(self) -> JsonDict:
         return self.data.model_dump(by_alias=True)
-
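
    For orientation, the constructor's input handling boils down to the
    following minimal sketch (hash_suffix is a hypothetical stand-in, not
    part of the module; schema validation and PKSUID construction omitted):

        import json
        from typing import Union

        # Hypothetical stand-in for the hashing logic above: a JSON string
        # is hashed as raw text, while a dict is re-serialized before
        # hashing, so both paths yield a suffix for the "log_" id prefix.
        def hash_suffix(data: Union[str, dict]) -> str:
            if isinstance(data, str):
                json.loads(data)  # malformed JSON raises here, as in Log.__init__
                return str(hash(data))
            if isinstance(data, dict):
                return str(hash(json.dumps(data)))
            raise TypeError(
                "ERROR: " + str(type(data)) + " data should be either a string or a JsonDict"
            )

    Note that the two paths hash different strings (the raw input versus
    json.dumps output), so the same payload given as text and as a dict can
    yield different id suffixes.
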
diff --git a/distill/core/types.py b/distill/core/types.py
index 1d1fdbe..af72560 100644
--- a/distill/core/types.py
+++ b/distill/core/types.py
@@ -1,22 +1,27 @@
-from typing import Union, List, Dict
+from typing import Dict, List, Union
+
+from pydantic.type_adapter import TypeAdapter
 from typing_extensions import TypeAliasType
 
+from distill.schemas.userale import UserAleRawSchema, UserAleIntervalSchema
+
 # TypeAliasType is necessary to avoid recursion error when validating this
 # type with Pydantic
 JSONSerializable = TypeAliasType(
     "JSONSerializable",
-    Union[str,
-        int, 
-        float, 
-        bool, 
-        None, 
-        List['JSONSerializable'],
-        Dict[str, 'JSONSerializable']
+    Union[
+        str,
+        int,
+        float,
+        bool,
+        None,
+        List["JSONSerializable"],
+        Dict[str, "JSONSerializable"],
     ],
 )
 
-JsonDict = Dict[str, 'JSONSerializable']
+JsonDict = Dict[str, "JSONSerializable"]
 
 Timestamp = Union[str, int, float]
 
-
+UserAleSchema = TypeAdapter(Union[UserAleRawSchema, UserAleIntervalSchema])
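
    The TypeAdapter over a Union is what makes the schema polymorphic:
    pydantic validates the payload against each member and returns whichever
    model matches. A minimal self-contained illustration with hypothetical
    stub models (not the real schemas):

        from typing import Union

        from pydantic import BaseModel
        from pydantic.type_adapter import TypeAdapter

        # Hypothetical stand-ins for UserAleRawSchema / UserAleIntervalSchema;
        # the field names are illustrative only.
        class RawStub(BaseModel):
            client_time: int

        class IntervalStub(BaseModel):
            start_time: int
            end_time: int

        stub = TypeAdapter(Union[RawStub, IntervalStub])

        # Each payload validates to the Union member whose required fields it has.
        assert isinstance(stub.validate_python({"client_time": 1}), RawStub)
        assert isinstance(
            stub.validate_python({"start_time": 1, "end_time": 2}), IntervalStub
        )
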
diff --git a/distill/schemas/userale.py b/distill/schemas/userale.py
index 133ab0d..cc6ca02 100644
--- a/distill/schemas/userale.py
+++ b/distill/schemas/userale.py
@@ -13,22 +13,19 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from typing import List, Optional
 from datetime import datetime
+from typing import List, Optional
 
-from pydantic import AliasGenerator, BaseModel, Field, field_serializer, field_validator
+from pydantic import BaseModel, Field, AliasGenerator, field_serializer, field_validator
 from pydantic.alias_generators import to_camel
 from pydantic.config import ConfigDict
 
-from .base import BaseSchema
-from datetime import datetime
-
+from distill.schemas.base import BaseSchema
 
 class Browser(BaseModel):
     browser: str
     version: str
 
-
 class Location(BaseModel):
     x: Optional[int]
     y: Optional[int]
@@ -42,38 +39,90 @@ class ScrnRes(BaseModel):
 class Details(BaseModel):
     window: bool
 
-
-class UserAleSchema(BaseSchema):
+class UserAleBaseSchema(BaseSchema):
     """
+    Base schema shared by raw and interval userale logs
     """
 
-    model_config = ConfigDict(
-        title="Log",
-        alias_generator=AliasGenerator(
-            validation_alias=to_camel, serialization_alias=to_camel
-        ),
-    )
-
     target: str
     path: List[str]
     page_url: str
     page_title: str
     page_referrer: str
     browser: Browser
-    client_time: int 
-    micro_time: int = Field(..., lt=2)
-    location: Location
-    scrn_res: ScrnRes
     type_field: str = Field(..., validation_alias="type", serialization_alias="type")
     log_type: str
     user_action: bool
-    details: Details
     user_id: str
     tool_version: Optional[str]
     tool_name: Optional[str]
     userale_version: Optional[str]
     session_id: str
+    http_session_id: str
+    browser_session_id: str
+
+    def _timestamp(self):
+        """
+        Returns:
+            float: POSIX time from userALE log's client_time field
+        """
+        pass
+
+
+class UserAleIntervalSchema(UserAleBaseSchema):
+    """
+    An interval log produced by UserAle
+    """
+
+    model_config = ConfigDict(
+        title="IntervalLog",
+        alias_generator=AliasGenerator(
+            validation_alias=to_camel, serialization_alias=to_camel
+        ),
+    )
+
+    count: int
+    duration: int
+    start_time: int
+    end_time: int
+    target_change: bool
+    type_change: bool
+
+    @field_validator("start_time", "end_time")
+    def validate_st(cls, st: float):
+        return datetime.fromtimestamp(st / 1000)
+
+    @field_serializer("start_time", "end_time")
+    def serialize_st(self, st: datetime):
+        return int(st.timestamp() * 1000)
+
+    # start_time and end_time share the validator and serializer above
+
+    def _timestamp(self):
+        """
+        Returns:
+            float: POSIX time from userALE log's start_time field
+        """
+        return self.start_time.timestamp()
+
+
+class UserAleRawSchema(UserAleBaseSchema):
+    """
+    A raw or custom log produced by UserAle
+    """
+
+    model_config = ConfigDict(
+        title="RawLog",
+        alias_generator=AliasGenerator(
+            validation_alias=to_camel, serialization_alias=to_camel
+        ),
+    )
+
+    client_time: int
+    micro_time: int = Field(..., lt=2)
+    location: Location
+    scrn_res: ScrnRes
+    details: Details
 
     @field_validator("client_time")
     def validate_ct(cls, ct: float):
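
    The start_time/end_time pair above stores incoming epoch milliseconds as
    datetimes and serializes them back out on dump. A self-contained sketch
    of that round-trip (StampStub is hypothetical, assuming pydantic v2):

        from datetime import datetime

        from pydantic import BaseModel, field_serializer, field_validator

        # Hypothetical model mirroring the validator/serializer pair above.
        class StampStub(BaseModel):
            start_time: int

            @field_validator("start_time")
            def to_dt(cls, v: int):
                # input arrives as epoch milliseconds; store a datetime instead
                return datetime.fromtimestamp(v / 1000)

            @field_serializer("start_time")
            def to_ms(self, v: datetime):
                # dump back out as epoch milliseconds
                return int(v.timestamp() * 1000)

        # a millisecond value divisible by 1000 round-trips exactly
        stub = StampStub(start_time=1708447014000)
        assert stub.model_dump() == {"start_time": 1708447014000}
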
diff --git a/tests/data/log_interval_data.json b/tests/data/log_interval_data.json
new file mode 100644
index 0000000..0ee8635
--- /dev/null
+++ b/tests/data/log_interval_data.json
@@ -0,0 +1 @@
+{"target": "nav.navigation-bar-desktop", "path": 
["nav.navigation-bar-desktop","body.body","html.no-js","#document","Window"], 
"pageUrl": "https://beam.apache.org/case-studies/";, "pageTitle": "Case 
Studies", "pageReferrer": "https://beam.apache.org/";, "browser": {"browser": 
"chrome","version": "114.0.0"}, "count": 1, "duration": 129, "startTime": 
1708447014463, "endTime": 1708447014592, "type": "mouseover", "logType": 
"interval", "targetChange": true, "typeChange": false, "userAction": f [...]
diff --git a/tests/data/log_test_data.json b/tests/data/log_test_data.json
index 2e10ad6..0831911 100644
--- a/tests/data/log_test_data.json
+++ b/tests/data/log_test_data.json
@@ -1 +1 @@
-{"target": "#document","path": [ "Window" ], "pageUrl": 
"https://github.com/apache/flagon/tree/master/docker";, "pageTitle": 
"flagon/docker at master · apache/flagon · GitHub", "pageReferrer": 
"https://gov.teams.microsoft.us/";, "browser": { "browser": "chrome", "version": 
"116.0.0" }, "clientTime": 1719530111079, "microTime": 0,"location": { "x": 
null, "y": null }, "scrnRes": { "width": 1349, "height":954 }, "type": "load", 
"logType": "raw", "userAction": true, "details": {"window": true  [...]
+{"target": "#document","path": [ "Window" ], "pageUrl": 
"https://github.com/apache/flagon/tree/master/docker";, "pageTitle": 
"flagon/docker at master · apache/flagon · GitHub", "pageReferrer": 
"https://gov.teams.microsoft.us/";, "browser": { "browser": "chrome", "version": 
"116.0.0" }, "clientTime": 1719530111079, "microTime": 0,"location": { "x": 
null, "y": null }, "scrnRes": { "width": 1349, "height":954 }, "type": "load", 
"logType": "raw", "userAction": true, "details": {"window": true  [...]
diff --git a/tests/test_log.py b/tests/test_log.py
index 53b2767..fc68de0 100644
--- a/tests/test_log.py
+++ b/tests/test_log.py
@@ -16,18 +16,18 @@
 
 import json
 import os
+from datetime import datetime
 
 from pydantic import ValidationError
 
-from distill.core.log import Log 
+from distill.core.log import Log
 from tests.data_config import DATA_DIR
-from datetime import datetime
 
 
 def test_log_constructor():
     exception_thrown = False
     try:
-        _ = Log(data="garbage data")
+        _ = Log(data='{"garbage data": "bad"}')
     except ValidationError:
         exception_thrown = True
     assert exception_thrown == True
@@ -48,14 +48,24 @@ def test_log_constructor():
     assert id.get_timestamp() == 1719530111079 // 1000
     assert id.prefix.startswith("log_")
 
+    data = load_interval_log()
+    test_interval_log = Log(data=data)
+    assert test_interval_log is not None
+    id = test_interval_log.id
+    assert id.get_timestamp() == 1708447014463 // 1000
+
 
 def test_log_serialize():
     data = load_log()
     test_log = Log(data=data)
 
-    correct_str = json.dumps(
-        json.loads(data), separators=(",", ":"), ensure_ascii=False
-    )
+    # correct_str = json.dumps(
+    # json.loads(data), separators=(",", ":"), ensure_ascii=False
+    # )
+    # Hardcoding this for now because creating a polymorphic model does not
+    # preserve order in pydantic. Our data is still correct but not in the
+    # original order. There doesn't seem to be an easy way to fix this right now
+    correct_str = '{"target":"#document","path":["Window"],"pageUrl":"https://github.com/apache/flagon/tree/master/docker","pageTitle":"flagon/docker at master · apache/flagon · GitHub","pageReferrer":"https://gov.teams.microsoft.us/","browser":{"browser":"chrome","version":"116.0.0"},"type":"load","logType":"raw","userAction":true,"userId":"nobody","toolVersion":null,"toolName":"test_app","useraleVersion":"2.3.0","sessionId":"session_1719530074303","httpSessionId":"72798a8ad776417183b1a [...]
     serialized_data = test_log.to_json()
     assert serialized_data == correct_str
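
    If exact key order ever stops being a requirement here, a hedged
    alternative to the hardcoded string (a sketch, not what the test above
    does) is an order-insensitive comparison of the parsed payloads:

        import json

        def same_payload(a: str, b: str) -> bool:
            # dict equality ignores key order, so reordered-but-equal
            # JSON objects compare equal
            return json.loads(a) == json.loads(b)
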
 
@@ -73,7 +83,7 @@ def test_log_normalize_timestamp():
     data = load_log()
     test_log = Log(data=data)
 
-    # note provided UserAle schema has clientTime in milliseconds but need it in 
+    # note provided UserAle schema has clientTime in milliseconds but need it in
     # seconds to be able to parse
     correct_ms = 1719530111079
     correct_dt = datetime.fromtimestamp(correct_ms / 1000)
@@ -86,3 +96,9 @@ def load_log() -> str:
     with open(os.path.join(DATA_DIR, "log_test_data.json")) as f:
         data = f.readline()
     return data
+
+
+def load_interval_log() -> str:
+    with open(os.path.join(DATA_DIR, "log_interval_data.json")) as f:
+        data = f.readline()
+    return data
