monitor.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317
  1. """
  2. CLI commands for monitoring functionality.
  3. """
  4. from __future__ import annotations
  5. from datetime import datetime, timedelta, timezone
  6. import click
  7. from flask import current_app as app
  8. from flask.cli import with_appcontext
  9. from flask_mail import Message
  10. from sentry_sdk import (
  11. capture_message as capture_message_for_sentry,
  12. set_context as set_sentry_context,
  13. )
  14. from sqlalchemy import select
  15. from tabulate import tabulate
  16. from flexmeasures.data import db
  17. from flexmeasures.data.models.task_runs import LatestTaskRun
  18. from flexmeasures.data.models.user import User
  19. from flexmeasures.utils.time_utils import server_now
  20. from flexmeasures.cli.utils import MsgStyle
# Top-level `monitor` command group. The `latest-run` and `last-seen`
# sub-commands defined in this module attach to it via `@fm_monitor.command(...)`.
# The docstring below doubles as the group's CLI help text.
@click.group("monitor")
def fm_monitor():
    """FlexMeasures: Monitor tasks."""
  24. def send_task_monitoring_alert(
  25. task_name: str,
  26. msg: str,
  27. latest_run: LatestTaskRun | None = None,
  28. custom_msg: str | None = None,
  29. ):
  30. """
  31. Send any monitoring message per Sentry and per email. Also log an error.
  32. """
  33. latest_run_txt = ""
  34. if latest_run:
  35. set_sentry_context(
  36. "latest_run", {"time": latest_run.datetime, "status": latest_run.status}
  37. )
  38. latest_run_txt = (
  39. f"Last run was at {latest_run.datetime}, status was: {latest_run.status}"
  40. )
  41. custom_msg_txt = ""
  42. if custom_msg:
  43. custom_msg_txt = f"\n\nNote: {custom_msg}"
  44. capture_message_for_sentry(msg)
  45. email_recipients = app.config.get("FLEXMEASURES_MONITORING_MAIL_RECIPIENTS", [])
  46. if len(email_recipients) > 0:
  47. email = Message(subject=f"Problem with task {task_name}", bcc=email_recipients)
  48. email.body = (
  49. f"{msg}\n\n{latest_run_txt}\nWe suggest to check the logs.{custom_msg_txt}"
  50. )
  51. app.mail.send(email)
  52. app.logger.error(f"{msg} {latest_run_txt} NOTE: {custom_msg}")
  53. @fm_monitor.command("latest-run")
  54. @with_appcontext
  55. @click.option(
  56. "--task",
  57. type=(str, int),
  58. multiple=True,
  59. required=True,
  60. help="The name of the task and the maximal allowed minutes between successful runs. Use multiple times if needed.",
  61. )
  62. @click.option(
  63. "--custom-message",
  64. type=str,
  65. default="",
  66. help="Add this message to the monitoring alert (if one is sent).",
  67. )
  68. def monitor_latest_run(task, custom_message):
  69. """
  70. Check if the given task's last successful execution happened less than the allowed time ago.
  71. Tasks are CLI commands with the @task_with_status_report decorator.
  72. If not, alert someone, via email or sentry.
  73. """
  74. for t in task:
  75. task_name = t[0]
  76. app.logger.info(f"Checking latest run of task {task_name} ...")
  77. latest_run: LatestTaskRun = db.session.get(LatestTaskRun, task_name)
  78. if latest_run is None:
  79. msg = f"Task {task_name} has no last run and thus cannot be monitored. Is it configured properly?"
  80. send_task_monitoring_alert(task_name, msg, custom_msg=custom_message)
  81. raise click.Abort()
  82. now = server_now()
  83. acceptable_interval = timedelta(minutes=t[1])
  84. # check if latest run was recently enough
  85. if latest_run.datetime >= now - acceptable_interval:
  86. # latest run time is okay, let's check the status
  87. if latest_run.status is False:
  88. msg = f"A failure has been reported on task {task_name}."
  89. send_task_monitoring_alert(
  90. task_name, msg, latest_run=latest_run, custom_msg=custom_message
  91. )
  92. else:
  93. msg = (
  94. f"Task {task_name}'s latest run time is outside of the acceptable range"
  95. f" ({acceptable_interval})."
  96. )
  97. send_task_monitoring_alert(
  98. task_name, msg, latest_run=latest_run, custom_msg=custom_message
  99. )
  100. app.logger.info("Done checking task runs ...")
  101. def send_lastseen_monitoring_alert(
  102. users: list[User],
  103. last_seen_delta: timedelta,
  104. alerted_users: bool,
  105. account_role: str | None = None,
  106. user_role: str | None = None,
  107. txt_about_already_alerted_users: str = "",
  108. ):
  109. """
  110. Tell monitoring recipients and Sentry about user(s) we haven't seen in a while.
  111. """
  112. user_info = [
  113. [user.username, user.last_seen_at.strftime("%d %b %Y %I:%M:%S %p")]
  114. for user in users
  115. ]
  116. msg = (
  117. f"The following user(s) have not contacted this FlexMeasures server for more"
  118. f" than {last_seen_delta}, even though we expect they would have:\n\n"
  119. )
  120. msg += tabulate(user_info, headers=["User", "Last contact"])
  121. # Sentry
  122. set_sentry_context(
  123. "last_seen_context",
  124. {
  125. "delta": last_seen_delta,
  126. "alerted_users": alerted_users,
  127. "account_role": account_role,
  128. "user_role": user_role,
  129. },
  130. )
  131. capture_message_for_sentry(msg)
  132. # Email
  133. msg += "\n"
  134. if account_role:
  135. msg += f"\nThis alert concerns users whose accounts have the role '{account_role}'."
  136. if user_role:
  137. msg += f"\nThis alert concerns users who have the role '{user_role}'."
  138. if txt_about_already_alerted_users:
  139. msg += f"\n{txt_about_already_alerted_users}"
  140. if alerted_users:
  141. msg += "\n\nThe user(s) has/have been notified by email, as well."
  142. else:
  143. msg += (
  144. "\n\nThe user(s) has/have not been notified (--alert-users was not used)."
  145. )
  146. email_recipients = app.config.get("FLEXMEASURES_MONITORING_MAIL_RECIPIENTS", [])
  147. if len(email_recipients) > 0:
  148. email = Message(
  149. subject="Last contact by user(s) too long ago", bcc=email_recipients
  150. )
  151. email.body = msg
  152. app.mail.send(email)
  153. app.logger.error(msg)
  154. @fm_monitor.command("last-seen")
  155. @with_appcontext
  156. @click.option(
  157. "--maximum-minutes-since-last-seen",
  158. type=int,
  159. required=True,
  160. help="Maximal number of minutes since last request.",
  161. )
  162. @click.option(
  163. "--alert-users/--do-not-alert-users",
  164. type=bool,
  165. default=False,
  166. help="If True, also send an email to the user. Defaults to False, as these users are often bots.",
  167. )
  168. @click.option(
  169. "--account-role",
  170. type=str,
  171. help="The name of an account role to filter for.",
  172. )
  173. @click.option(
  174. "--user-role",
  175. type=str,
  176. help="The name of a user role to filter for.",
  177. )
  178. @click.option(
  179. "--custom-user-message",
  180. type=str,
  181. default="",
  182. help="Add this message to the monitoring alert email to users (if one is sent).",
  183. )
  184. @click.option(
  185. "--only-newly-absent-users/--all-absent-users",
  186. type=bool,
  187. default=True,
  188. help="If True, a user is only included in this alert once after they were absent for too long. Defaults to True, so as to keep regular emails to low volume with newsworthy alerts.",
  189. )
  190. @click.option(
  191. "--task-name",
  192. type=str,
  193. default="monitor-last-seen-users",
  194. help="Optional name of the task, to distinguish finding out when the last monitoring happened (see --only-newly-absent-users).",
  195. )
  196. def monitor_last_seen(
  197. maximum_minutes_since_last_seen: int,
  198. alert_users: bool = False,
  199. account_role: str | None = None,
  200. user_role: str | None = None,
  201. custom_user_message: str | None = None,
  202. only_newly_absent_users: bool = True,
  203. task_name: str = "monitor-last-seen-users",
  204. ):
  205. """
  206. Check if given users last contact (via a request) happened less than the allowed time ago.
  207. Helpful for user accounts that are expected to contact FlexMeasures regularly (in an automated fashion).
  208. If the last contact was too long ago, we send alerts via Sentry, as well as emails to monitoring mail recipients.
  209. The user can be informed, as well.
  210. The set of users can be narrowed down by roles.
  211. Per default, this function will only alert you once per absent user (to avoid information overload).
  212. To (still) keep an overview over all absentees, we recommend to run this command in short regular intervals as-is
  213. and with --all-absent-users once per longer interval (e.g. per 24h).
  214. If you run distinct filters, you can use distinct task names, so the --only-newly-absent-users feature
  215. will work for all filters independently.
  216. """
  217. last_seen_delta = timedelta(minutes=maximum_minutes_since_last_seen)
  218. latest_run: LatestTaskRun = db.session.get(LatestTaskRun, task_name)
  219. # find users we haven't seen in the given time window (last_seen_at is naive UTC)
  220. users: list[User] = db.session.scalars(
  221. select(User).filter(
  222. User.last_seen_at < datetime.now(timezone.utc) - last_seen_delta
  223. )
  224. ).all()
  225. # role filters
  226. if account_role is not None:
  227. users = [user for user in users if user.account.has_role(account_role)]
  228. if user_role is not None:
  229. users = [user for user in users if user.has_role(user_role)]
  230. # filter out users who we already included in this check's last run
  231. txt_about_already_alerted_users = ""
  232. if only_newly_absent_users and latest_run:
  233. original_length = len(users)
  234. users = [
  235. user
  236. for user in users
  237. if user.last_seen_at.replace(tzinfo=timezone.utc) + last_seen_delta
  238. > latest_run.datetime
  239. ]
  240. if len(users) < original_length:
  241. txt_about_already_alerted_users = "There are (also) users who have been absent long, but one of the earlier monitoring runs already included them (run monitoring with --include-all-users-each-run to see them)."
  242. if not users:
  243. click.secho(
  244. f"All good ― no users were found with relevant criteria and last_seen_at longer than {maximum_minutes_since_last_seen} minutes ago. {txt_about_already_alerted_users}",
  245. **MsgStyle.SUCCESS,
  246. )
  247. raise click.Abort()
  248. # inform users & monitoring recipients
  249. if alert_users:
  250. for user in users:
  251. msg = (
  252. f"We noticed that user {user.username} has not been in contact with this FlexMeasures server"
  253. f" for at least {maximum_minutes_since_last_seen} minutes (last contact was {user.last_seen_at})."
  254. )
  255. if custom_user_message:
  256. msg += f"\n\n{custom_user_message}"
  257. else:
  258. msg += (
  259. "\nBy our own accounting, this should usually not happen."
  260. "\n\nMaybe you want to check if your local code is still working well."
  261. )
  262. email = Message(
  263. subject=f"Last contact by user {user.username} has been too long ago",
  264. recipients=[user.email],
  265. )
  266. email.body = msg
  267. app.mail.send(email)
  268. else:
  269. click.secho("Users are not being alerted.", **MsgStyle.ERROR)
  270. send_lastseen_monitoring_alert(
  271. users,
  272. last_seen_delta,
  273. alerted_users=alert_users,
  274. account_role=account_role,
  275. user_role=user_role,
  276. txt_about_already_alerted_users=txt_about_already_alerted_users,
  277. )
  278. # remember that we checked at this time
  279. LatestTaskRun.record_run(task_name, True)
  280. db.session.commit()
# Register the `monitor` command group with the Flask app's CLI.
# NOTE(review): `app` is Flask's `current_app` proxy (see the imports), so this
# line presumably needs an active application context at import time — confirm.
app.cli.add_command(fm_monitor)