pabloem commented on code in PR #35922: URL: https://github.com/apache/beam/pull/35922#discussion_r2292575421
########## infra/security/log_analyzer.py: ########## @@ -0,0 +1,277 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import ssl +import yaml +import logging +import smtplib +import os +from dataclasses import dataclass +from datetime import datetime, timedelta +from google.cloud import logging_v2 +from google.cloud import storage +from typing import List, Dict, Any +import argparse + +REPORT_SUBJECT = "Weekly IAM Security Events Report" +REPORT_BODY_TEMPLATE = """ +Hello Team, + +Please find below the summary of IAM security events for the past week: + +{event_summary} + +Best Regards, +Automated GitHub Action +""" + +@dataclass +class Sink: + name: str + description: str + filter_methods: List[str] + excluded_principals: List[str] + +class LogAnalyzer(): + def __init__(self, project_id: str, gcp_bucket: str, logger: logging.Logger, sinks: List[Sink]): + self.project_id = project_id + self.bucket = gcp_bucket + self.logger = logger + self.sinks = sinks + + def _construct_filter(self, sink: Sink) -> str: + """ + Constructs a filter string for a given sink. + + Args: + sink (Sink): The sink object containing filter information. + + Returns: + str: The constructed filter string. 
+ """ + + method_filters = [] + for method in sink.filter_methods: + method_filters.append(f'protoPayload.methodName="{method}"') + + exclusion_filters = [] + for principal in sink.excluded_principals: + exclusion_filters.append(f'protoPayload.authenticationInfo.principalEmail != "{principal}"') + + if method_filters and exclusion_filters: + filter_ = f"({' OR '.join(method_filters)}) AND ({' AND '.join(exclusion_filters)})" + elif method_filters: + filter_ = f"({' OR '.join(method_filters)})" + elif exclusion_filters: + filter_ = f"({' AND '.join(exclusion_filters)})" + else: + filter_ = "" + + return filter_ + + def _create_log_sink(self, sink: Sink) -> None: + """ + Creates a log sink in GCP if it doesn't already exist. + If it already exists, it updates the sink with the new filter in case the filter has changed. + + Args: + sink (Sink): The sink object to create. + """ + logging_client = logging_v2.Client(project=self.project_id) + filter_ = self._construct_filter(sink) + destination = "storage.googleapis.com/{bucket}".format(bucket=self.bucket) + + new_sink = logging_client.sink(sink.name, filter_=filter_, destination=destination) + + if new_sink.exists(): + self.logger.debug(f"Sink {sink.name} already exists.") + old_sink = logging_client.sink(sink.name) + old_sink.reload() + if old_sink.filter_ != filter_: + old_sink.filter_ = filter_ + old_sink.update() + self.logger.info(f"Updated sink {sink.name}'s filter.") + else: + new_sink.create() + self.logger.info(f"Created sink {sink.name}.") + + logging_client.close() + + def initialize_sinks(self) -> None: + for sink in self.sinks: + self._create_log_sink(sink) + self.logger.info(f"Initialized sink: {sink.name}") + + def get_event_logs(self, days: int = 7) -> List[Dict[str, Any]]: + """ + Reads and retrieves log events from the specified time range from the GCP Cloud Storage bucket. + + Args: + days (int): The number of days to look back for log analysis. 
+ + Returns: + List[Dict[str, Any]]: A list of log entries that match the specified time range. + """ + found_events = [] + storage_client = storage.Client(project=self.project_id) + + blobs = storage_client.list_blobs(self.bucket) + days_ago = datetime.now() - timedelta(days=days) Review Comment: let's make sure that we have a proper time window to check for new files. Because `now()` is an arbitrary number - let's do: - Created before `datetime.datetime(now.year, now.month, now.day, now.hour, 0) - timedelta(minutes=30)` - Created after `datetime.datetime(now.year, now.month, now.day, now.hour, 0) - timedelta(days=7, minutes=30)` i.e. 8.30-8.30 each week -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@beam.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org