123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317 |
- """
- CLI commands for monitoring functionality.
- """
- from __future__ import annotations
- from datetime import datetime, timedelta, timezone
- import click
- from flask import current_app as app
- from flask.cli import with_appcontext
- from flask_mail import Message
- from sentry_sdk import (
- capture_message as capture_message_for_sentry,
- set_context as set_sentry_context,
- )
- from sqlalchemy import select
- from tabulate import tabulate
- from flexmeasures.data import db
- from flexmeasures.data.models.task_runs import LatestTaskRun
- from flexmeasures.data.models.user import User
- from flexmeasures.utils.time_utils import server_now
- from flexmeasures.cli.utils import MsgStyle
# Top-level "monitor" command group; sub-commands attach themselves via @fm_monitor.command(...).
@click.group(name="monitor")
def fm_monitor():
    """FlexMeasures: Monitor tasks."""
def send_task_monitoring_alert(
    task_name: str,
    msg: str,
    latest_run: LatestTaskRun | None = None,
    custom_msg: str | None = None,
):
    """
    Send any monitoring message per Sentry and per email. Also log an error.

    :param task_name:  Name of the task the alert is about (used in the email subject).
    :param msg:        Description of the problem. Sent to Sentry as-is; also starts the email body and the log line.
    :param latest_run: If given, its time and status are attached as Sentry context
                       and mentioned in the email body and the log line.
    :param custom_msg: Optional note, appended to the email body and the log line.
    """
    latest_run_txt = ""
    if latest_run:
        set_sentry_context(
            "latest_run", {"time": latest_run.datetime, "status": latest_run.status}
        )
        latest_run_txt = (
            f"Last run was at {latest_run.datetime}, status was: {latest_run.status}"
        )

    custom_msg_txt = f"\n\nNote: {custom_msg}" if custom_msg else ""

    capture_message_for_sentry(msg)

    # Emails are only sent if monitoring recipients are configured (they are addressed via bcc).
    email_recipients = app.config.get("FLEXMEASURES_MONITORING_MAIL_RECIPIENTS", [])
    if email_recipients:
        email = Message(subject=f"Problem with task {task_name}", bcc=email_recipients)
        email.body = (
            f"{msg}\n\n{latest_run_txt}\nWe suggest to check the logs.{custom_msg_txt}"
        )
        app.mail.send(email)

    # Only log the parts we actually have, so the log line never ends in "NOTE: None" or "NOTE: ".
    log_parts = [msg]
    if latest_run_txt:
        log_parts.append(latest_run_txt)
    if custom_msg:
        log_parts.append(f"NOTE: {custom_msg}")
    app.logger.error(" ".join(log_parts))
@fm_monitor.command("latest-run")
@with_appcontext
@click.option(
    "--task",
    type=(str, int),
    multiple=True,
    required=True,
    help="The name of the task and the maximal allowed minutes between successful runs. Use multiple times if needed.",
)
@click.option(
    "--custom-message",
    type=str,
    default="",
    help="Add this message to the monitoring alert (if one is sent).",
)
def monitor_latest_run(task, custom_message):
    """
    Check if the given task's last successful execution happened less than the allowed time ago.

    Tasks are CLI commands with the @task_with_status_report decorator.
    If not, alert someone, via email or sentry.
    All given tasks are checked, also if one of them has no recorded run yet.
    """
    # `task` holds one (task name, allowed minutes) tuple per --task option.
    # An alert is sent per task which has no recorded run, which ran too long ago or whose latest run failed.
    # click.Abort is raised (after all tasks were checked) if any task had no recorded run.
    task_without_run_found = False

    for task_name, max_minutes in task:
        app.logger.info(f"Checking latest run of task {task_name} ...")
        latest_run: LatestTaskRun | None = db.session.get(LatestTaskRun, task_name)
        if latest_run is None:
            msg = f"Task {task_name} has no last run and thus cannot be monitored. Is it configured properly?"
            send_task_monitoring_alert(task_name, msg, custom_msg=custom_message)
            # Do not abort right away, otherwise the remaining tasks would never be checked.
            task_without_run_found = True
            continue

        now = server_now()
        acceptable_interval = timedelta(minutes=max_minutes)
        # check if latest run was recently enough
        if latest_run.datetime >= now - acceptable_interval:
            # latest run time is okay, let's check the status
            if latest_run.status is False:
                msg = f"A failure has been reported on task {task_name}."
                send_task_monitoring_alert(
                    task_name, msg, latest_run=latest_run, custom_msg=custom_message
                )
        else:
            msg = (
                f"Task {task_name}'s latest run time is outside of the acceptable range"
                f" ({acceptable_interval})."
            )
            send_task_monitoring_alert(
                task_name, msg, latest_run=latest_run, custom_msg=custom_message
            )

    app.logger.info("Done checking task runs ...")

    if task_without_run_found:
        raise click.Abort()
def send_lastseen_monitoring_alert(
    users: list[User],
    last_seen_delta: timedelta,
    alerted_users: bool,
    account_role: str | None = None,
    user_role: str | None = None,
    txt_about_already_alerted_users: str = "",
):
    """
    Tell monitoring recipients and Sentry about user(s) we haven't seen in a while.

    Sentry receives the table of users (plus a context with the filter settings).
    The email to the monitoring recipients and the error log entry additionally
    describe the role filters in use and whether the users themselves were notified.
    """
    # One table row per absent user: name and formatted time of last contact
    rows = []
    for absent_user in users:
        last_contact = absent_user.last_seen_at.strftime("%d %b %Y %I:%M:%S %p")
        rows.append([absent_user.username, last_contact])

    intro = (
        f"The following user(s) have not contacted this FlexMeasures server for more"
        f" than {last_seen_delta}, even though we expect they would have:\n\n"
    )
    sentry_msg = intro + tabulate(rows, headers=["User", "Last contact"])

    # Sentry
    sentry_context = {
        "delta": last_seen_delta,
        "alerted_users": alerted_users,
        "account_role": account_role,
        "user_role": user_role,
    }
    set_sentry_context("last_seen_context", sentry_context)
    capture_message_for_sentry(sentry_msg)

    # Email (and log): the Sentry message, followed by some explanations
    explanations = ["\n"]
    if account_role:
        explanations.append(
            f"\nThis alert concerns users whose accounts have the role '{account_role}'."
        )
    if user_role:
        explanations.append(
            f"\nThis alert concerns users who have the role '{user_role}'."
        )
    if txt_about_already_alerted_users:
        explanations.append(f"\n{txt_about_already_alerted_users}")
    explanations.append(
        "\n\nThe user(s) has/have been notified by email, as well."
        if alerted_users
        else "\n\nThe user(s) has/have not been notified (--alert-users was not used)."
    )
    full_msg = sentry_msg + "".join(explanations)

    recipients = app.config.get("FLEXMEASURES_MONITORING_MAIL_RECIPIENTS", [])
    if len(recipients) > 0:
        email = Message(subject="Last contact by user(s) too long ago", bcc=recipients)
        email.body = full_msg
        app.mail.send(email)

    app.logger.error(full_msg)
@fm_monitor.command("last-seen")
@with_appcontext
@click.option(
    "--maximum-minutes-since-last-seen",
    type=int,
    required=True,
    help="Maximal number of minutes since last request.",
)
@click.option(
    "--alert-users/--do-not-alert-users",
    type=bool,
    default=False,
    help="If True, also send an email to the user. Defaults to False, as these users are often bots.",
)
@click.option(
    "--account-role",
    type=str,
    help="The name of an account role to filter for.",
)
@click.option(
    "--user-role",
    type=str,
    help="The name of a user role to filter for.",
)
@click.option(
    "--custom-user-message",
    type=str,
    default="",
    help="Add this message to the monitoring alert email to users (if one is sent).",
)
@click.option(
    "--only-newly-absent-users/--all-absent-users",
    type=bool,
    default=True,
    help="If True, a user is only included in this alert once after they were absent for too long. Defaults to True, so as to keep regular emails to low volume with newsworthy alerts.",
)
@click.option(
    "--task-name",
    type=str,
    default="monitor-last-seen-users",
    help="Optional name of the task, to distinguish finding out when the last monitoring happened (see --only-newly-absent-users).",
)
def monitor_last_seen(
    maximum_minutes_since_last_seen: int,
    alert_users: bool = False,
    account_role: str | None = None,
    user_role: str | None = None,
    custom_user_message: str | None = None,
    only_newly_absent_users: bool = True,
    task_name: str = "monitor-last-seen-users",
):
    """
    Check if given users last contact (via a request) happened less than the allowed time ago.

    Helpful for user accounts that are expected to contact FlexMeasures regularly (in an automated fashion).
    If the last contact was too long ago, we send alerts via Sentry, as well as emails to monitoring mail recipients.
    The user can be informed, as well.

    The set of users can be narrowed down by roles.

    Per default, this function will only alert you once per absent user (to avoid information overload).
    To (still) keep an overview over all absentees, we recommend to run this command in short regular intervals as-is
    and with --all-absent-users once per longer interval (e.g. per 24h).

    If you run distinct filters, you can use distinct task names, so the --only-newly-absent-users feature
    will work for all filters independently.
    """
    last_seen_delta = timedelta(minutes=maximum_minutes_since_last_seen)
    # The previous run of this monitoring task (None if no run has been recorded under this task name)
    latest_run: LatestTaskRun | None = db.session.get(LatestTaskRun, task_name)

    # find users we haven't seen in the given time window (last_seen_at is naive UTC)
    users: list[User] = db.session.scalars(
        select(User).filter(
            User.last_seen_at < datetime.now(timezone.utc) - last_seen_delta
        )
    ).all()

    # role filters
    if account_role is not None:
        users = [user for user in users if user.account.has_role(account_role)]
    if user_role is not None:
        users = [user for user in users if user.has_role(user_role)]

    # filter out users who we already included in this check's last run:
    # keep only users whose allowed time window ended after the previous run
    txt_about_already_alerted_users = ""
    if only_newly_absent_users and latest_run:
        original_length = len(users)
        users = [
            user
            for user in users
            if user.last_seen_at.replace(tzinfo=timezone.utc) + last_seen_delta
            > latest_run.datetime
        ]
        if len(users) < original_length:
            # Point to the flag which actually exists on this command (--all-absent-users)
            txt_about_already_alerted_users = "There are (also) users who have been absent long, but one of the earlier monitoring runs already included them (run monitoring with --all-absent-users to see them)."

    if not users:
        click.secho(
            f"All good ― no users were found with relevant criteria and last_seen_at longer than {maximum_minutes_since_last_seen} minutes ago. {txt_about_already_alerted_users}",
            **MsgStyle.SUCCESS,
        )
        # NOTE(review): click.Abort normally leads to a non-zero exit code, even though this is the
        # "all good" path, and this run is not recorded below - confirm both are intended.
        raise click.Abort()

    # inform users & monitoring recipients
    if alert_users:
        for user in users:
            msg = (
                f"We noticed that user {user.username} has not been in contact with this FlexMeasures server"
                f" for at least {maximum_minutes_since_last_seen} minutes (last contact was {user.last_seen_at})."
            )
            if custom_user_message:
                msg += f"\n\n{custom_user_message}"
            else:
                msg += (
                    "\nBy our own accounting, this should usually not happen."
                    "\n\nMaybe you want to check if your local code is still working well."
                )
            email = Message(
                subject=f"Last contact by user {user.username} has been too long ago",
                recipients=[user.email],
            )
            email.body = msg
            app.mail.send(email)
    else:
        click.secho("Users are not being alerted.", **MsgStyle.ERROR)

    send_lastseen_monitoring_alert(
        users,
        last_seen_delta,
        alerted_users=alert_users,
        account_role=account_role,
        user_role=user_role,
        txt_about_already_alerted_users=txt_about_already_alerted_users,
    )

    # remember that we checked at this time
    LatestTaskRun.record_run(task_name, True)
    db.session.commit()
- app.cli.add_command(fm_monitor)
|