EandrewJones commented on code in PR #33:
URL: https://github.com/apache/flagon-distill/pull/33#discussion_r1507715709
##########
distill/utils/crud.py:
##########

@@ -47,3 +49,167 @@ def getUUID(log):
         )
     else:
         return str(log["sessionID"]) + str(log["clientTime"]) + str(log["logType"])
+
+def group_by_user(log):
+    """
+    A helper function to create separate logs associated with unique users,
+    where a unique user is identified by the browserSessionId
+    :param log: Userale log in the form of a dictionary
+    :return: A dictionary of logs belonging to unique users
+    """
+    grouped_data = {}
+    for d in log:
+        # Use the browserSessionId as the grouping key
+        sessionId = d.get('browserSessionId', '')
+        combined_key = str(sessionId)
+        if combined_key not in grouped_data:
+            grouped_data[combined_key] = []
+        grouped_data[combined_key].append(d)
+    return grouped_data
+
+def chunk_by_idle_time(log, inactive_interval_s=60):
+    """
+    Divide/chunk the log into sets whose clientTime values are separated by
+    idle time, where idle time is defined as a period of inactivity that
+    exceeds the specified inactive_interval_s (in seconds).
+    By default, the interval is 60 seconds.
+    :param log: Userale log in the form of a dictionary
+    :param inactive_interval_s: Threshold of inactivity (no logged activity) in seconds
+    :return: A list of sets separated by idle time
+    """
+    separated_sets = []
+    current_set = []
+    # Assume that clientTime is an integer (unix time) expressed in milliseconds
+    difference_in_ms = inactive_interval_s * 1000
+
+    # Initialize the previous timestamp
+    if len(log) > 0:
+        if 'clientTime' in log[0]:
+            previous_timestamp = log[0]['clientTime']
+        else:
+            previous_timestamp = log[0]['endTime']
+
+    for item in log:
+        if 'clientTime' in item:
+            current_timestamp = item['clientTime']
+        else:
+            current_timestamp = item['endTime']
+        if current_timestamp - previous_timestamp > difference_in_ms:
+            # If the current set is not empty, add it to the list of sets
+            if current_set:
+                separated_sets.append(current_set)
+                current_set = []
+
+        # Add the current item to the current set and update the previous timestamp
+        current_set.append(item)
+        previous_timestamp = current_timestamp
+
+    # Add the last set if it's not empty
+    if current_set:
+        separated_sets.append(current_set)
+
+    return separated_sets
+
+def chunk_by_tabId(log):
+    """
+    Separate logs by their browserSessionId
+    :param log: Userale log in the form of a dictionary
+    :return: A dictionary of sets separated by unique browserSessionId
+    """
+    grouped_data = {}
+    for d in log:

Review Comment:
   Please use a more informative name for the individual item being iterated on. The docstring indicates this is a singular log, so what's being iterated on here are the keys. Please do `for key in log`.
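A minimal sketch of the requested change, for illustration only. The review suggests `for key in log`; if the iterated items turn out to be full event dicts (as the `d.get(...)` calls in the diff imply), a name like `log_entry` may fit better, but either is clearer than `d`:

```python
# Sketch only: a descriptive loop variable instead of `d`.
for log_entry in log:
    session_id = log_entry.get('browserSessionId', '')
```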
##########
tests/test_utils.py:
##########

@@ -66,3 +66,25 @@ def test_UUID_without_type():
     # Assert UID uniqueness
     assert len(data) == len(raw_data)
     assert len(data) == 19
+
+
+def test_chunk_to_usersessions():

Review Comment:
   For each of the user-sessions functionalities here, can you write a separate test? A unit test should only ever test a single behavior/case for a function, and the name of the test should be informative about the behavior under test. That way, if the test fails, we know exactly which aspect of the function is failing.
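A rough sketch of the split being asked for, with one behavior per test and descriptive names. The fixture data and the import path (`distill.utils.crud`) are assumptions for illustration; the concrete cases are up to the PR author:

```python
import pytest

# Import path assumed from the diff's file location
from distill.utils.crud import group_by_user, chunk_by_idle_time


@pytest.fixture
def sample_logs():
    # Hypothetical fixture data, for illustration only
    return [
        {"browserSessionId": "a", "clientTime": 0},
        {"browserSessionId": "a", "clientTime": 120_000},  # 120 s gap
        {"browserSessionId": "b", "clientTime": 1_000},
    ]


def test_group_by_user_groups_logs_by_browser_session_id(sample_logs):
    grouped = group_by_user(sample_logs)
    assert set(grouped) == {"a", "b"}
    assert len(grouped["a"]) == 2


def test_chunk_by_idle_time_splits_on_gaps_longer_than_interval(sample_logs):
    session_a = [d for d in sample_logs if d["browserSessionId"] == "a"]
    assert len(chunk_by_idle_time(session_a, inactive_interval_s=60)) == 2
```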
##########
distill/utils/crud.py (continuing the same hunk):
##########
+        # Depending on the log type, the tab ID can be inside the details element
+        if 'browserSessionId' in d:
+            tab_key = 'tab_' + str(d['httpSessionId'])
+        else:
+            tab_key = 'unknown'
+        if tab_key not in grouped_data:
+            grouped_data[tab_key] = []
+        grouped_data[tab_key].append(d)
+    return grouped_data
+
+def match_url(url, pattern):
+    # Escape dots in the pattern, since dot is a special character in regex,
+    # and replace '*' with '.*' to match any character sequence
+    regex_pattern = re.escape(pattern).replace('\\*', '.*')
+
+    # Add anchors to match the entire string
+    regex_pattern = '^' + regex_pattern + '$'
+
+    # Compile the regex pattern
+    compiled_pattern = re.compile(regex_pattern)
+
+    # Check if the URL matches the pattern
+    return bool(compiled_pattern.match(url))
+
+def chunk_by_URL(log, re):
+    """
+    Separate logs by the site that users interact with
+    :param log: Userale log in the form of a dictionary
+    :param re: URL pattern to filter pageUrl domains on
+    :return: A dictionary of sets separated by unique site
+    """
+    grouped_data = {}
+    for d in log:

Review Comment:
   Same as above — please use a more informative name for the loop variable here as well.
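For reference, the glob-style matching implemented by `match_url` in the hunk above behaves like this on illustrative inputs:

```python
match_url("github.com", "*.com")     # True:  pattern becomes r'^.*\.com$'
match_url("github.com", "gitlab.*")  # False: pattern becomes r'^gitlab\..*$'
```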
##########
distill/utils/crud.py (continuing the same hunk):
##########
+        # Depending on the log type, the URL can be inside the details element
+        if 'pageUrl' in d:
+            domain = urlparse(d['pageUrl']).netloc
+            # Filter with the "re" parameter
+            if (re != "."):
+                if (match_url(domain, re)):
+                    domain_key = re
+                else:
+                    # Does not match, so we are skipping it
+                    continue
+            else:
+                domain_key = domain
+        else:
+            domain_key = 'unknown'
+
+        if domain_key not in grouped_data:
+            grouped_data[domain_key] = []
+        grouped_data[domain_key].append(d)
+    return grouped_data
+
+def chunk_to_usersessions(log, inactive_interval_s = 60, group_by_type = "None", url_re = "."):

Review Comment:
   Please keep names consistent and snake_cased. Change to `chunk_by_user_sessions`.

--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: notifications-unsubscr...@flagon.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

---------------------------------------------------------------------
To unsubscribe, e-mail: notifications-unsubscr...@flagon.apache.org
For additional commands, e-mail: notifications-h...@flagon.apache.org