Skip to content

Commit 8420cfd

Browse files
authored
Fix acknowledge reminder task (#5179)
# What this PR does - Adds 10 minutes lock for acknowledge reminder task to prevent task duplicates, that causes posting multiple reminder messages and flooding in Slack threads. - Adds a new signal for acknowledge reminder task instead of using `alert_group_action_triggered_signal` since it is used only to post reminder message in Slack thread and it's not needed to be processed by other representatives ## Which issue(s) this PR closes Related to grafana/oncall-private#2953 ## Checklist - [x] Unit, integration, and e2e (if applicable) tests updated - [x] Documentation added (or `pr:no public docs` PR label added if not required) - [x] Added the relevant release notes label (see labels prefixed w/ `release:`). These labels dictate how your PR will show up in the autogenerated release notes.
1 parent 4667960 commit 8420cfd

File tree

6 files changed

+138
-29
lines changed

6 files changed

+138
-29
lines changed

engine/apps/alerts/signals.py

+7
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,9 @@
2424
# Signal to rerender alert group's resolution note in all connected integrations (Slack)
2525
alert_group_update_resolution_note_signal = django.dispatch.Signal()
2626

27+
# Signal to post acknowledge reminder message (Slack)
28+
post_ack_reminder_message_signal = django.dispatch.Signal()
29+
2730
# Currently only writes error in Slack thread while notify user. Maybe it is worth to delete it?
2831
user_notification_action_triggered_signal = django.dispatch.Signal()
2932

@@ -40,6 +43,10 @@
4043
AlertGroupSlackRepresentative.on_alert_group_update_resolution_note,
4144
)
4245

46+
post_ack_reminder_message_signal.connect(
47+
AlertGroupSlackRepresentative.on_alert_group_post_acknowledge_reminder_message,
48+
)
49+
4350
user_notification_action_triggered_signal.connect(
4451
UserSlackRepresentative.on_user_action_triggered,
4552
)

engine/apps/alerts/tasks/acknowledge_reminder.py

+30-4
Original file line numberDiff line numberDiff line change
@@ -2,15 +2,25 @@
22
from functools import partial
33

44
from django.conf import settings
5+
from django.core.cache import cache
56
from django.db import transaction
67
from django.utils import timezone
78

9+
from apps.alerts.signals import post_ack_reminder_message_signal
810
from common.custom_celery_tasks import shared_dedicated_queue_retry_task
911

1012
from .send_alert_group_signal import send_alert_group_signal
1113
from .task_logger import task_logger
1214

13-
MAX_RETRIES = 1 if settings.DEBUG else None
15+
MAX_RETRIES = 1 if settings.DEBUG else 10
16+
17+
18+
def is_allowed_to_send_acknowledge_reminder(alert_group_id, process_id):
19+
lock_id = f"acknowledge-reminder-lock-{alert_group_id}"
20+
lock_period = 60 * 10 # 10 min
21+
# cache.add returns False if the key already exists
22+
status = cache.add(lock_id, process_id, lock_period)
23+
return status
1424

1525

1626
@shared_dedicated_queue_retry_task(autoretry_for=(Exception,), retry_backoff=True, max_retries=MAX_RETRIES)
@@ -28,6 +38,11 @@ def acknowledge_reminder_task(alert_group_pk: int, unacknowledge_process_id: str
2838
if unacknowledge_process_id != alert_group.last_unique_unacknowledge_process_id:
2939
return
3040

41+
# Don't proceed if acknowledge reminder for this alert group has already been sent recently
42+
if not is_allowed_to_send_acknowledge_reminder(alert_group.id, unacknowledge_process_id):
43+
task_logger.info(f"Acknowledge reminder for alert_group {alert_group_pk} has already been sent recently.")
44+
return
45+
3146
organization = alert_group.channel.organization
3247

3348
# Get timeout values
@@ -55,8 +70,9 @@ def acknowledge_reminder_task(alert_group_pk: int, unacknowledge_process_id: str
5570

5671
# unacknowledge_timeout_task uses acknowledged_by_confirmed to check if acknowledgement reminder has been confirmed
5772
# by the user. Setting to None here to indicate that the user has not confirmed the acknowledgement reminder
58-
alert_group.acknowledged_by_confirmed = None
59-
alert_group.save(update_fields=["acknowledged_by_confirmed"])
73+
if alert_group.acknowledged_by_confirmed is not None:
74+
alert_group.acknowledged_by_confirmed = None
75+
alert_group.save(update_fields=["acknowledged_by_confirmed"])
6076

6177
if unacknowledge_timeout: # "unack in N minutes if no response" is enabled
6278
unacknowledge_timeout_task.apply_async(
@@ -77,7 +93,7 @@ def acknowledge_reminder_task(alert_group_pk: int, unacknowledge_process_id: str
7793
type=AlertGroupLogRecord.TYPE_ACK_REMINDER_TRIGGERED, author=alert_group.acknowledged_by_user
7894
)
7995
task_logger.info(f"created log record {log_record.pk}, sending signal...")
80-
transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk))
96+
transaction.on_commit(partial(send_post_ack_reminder_message_signal.delay, log_record.pk))
8197

8298

8399
@shared_dedicated_queue_retry_task(autoretry_for=(Exception,), retry_backoff=True, max_retries=MAX_RETRIES)
@@ -138,3 +154,13 @@ def unacknowledge_timeout_task(alert_group_pk: int, unacknowledge_process_id: st
138154
transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk))
139155
alert_group.unacknowledge()
140156
alert_group.start_escalation_if_needed()
157+
158+
159+
@shared_dedicated_queue_retry_task(autoretry_for=(Exception,), retry_backoff=True, max_retries=MAX_RETRIES)
160+
def send_post_ack_reminder_message_signal(log_record_id):
161+
"""
162+
Sends signal to post acknowledge reminder message to Slack thread.
163+
The signal is connected to AlertGroupSlackRepresentative.
164+
"""
165+
task_logger.info(f"sending signal for posting ack reminder message, log record {log_record_id}")
166+
post_ack_reminder_message_signal.send(sender=send_post_ack_reminder_message_signal, log_record=log_record_id)

engine/apps/alerts/tests/test_acknowledge_reminder.py

+52-5
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,13 @@
33

44
import pytest
55
from celery import uuid as celery_uuid
6+
from django.core.cache import cache
67
from django.utils import timezone
78

89
from apps.alerts.constants import ActionSource
910
from apps.alerts.models import AlertGroup, AlertGroupLogRecord
1011
from apps.alerts.tasks import acknowledge_reminder_task
11-
from apps.alerts.tasks.acknowledge_reminder import unacknowledge_timeout_task
12+
from apps.alerts.tasks.acknowledge_reminder import send_post_ack_reminder_message_signal, unacknowledge_timeout_task
1213
from apps.user_management.models import Organization
1314

1415
TASK_ID = "TASK_ID"
@@ -156,8 +157,10 @@ def test_acknowledge_reminder_task_skip(
156157

157158
@patch.object(unacknowledge_timeout_task, "apply_async")
158159
@patch.object(acknowledge_reminder_task, "apply_async")
160+
@patch.object(send_post_ack_reminder_message_signal, "delay")
159161
@pytest.mark.django_db
160-
def test_acknowledge_reminder_task_reschedules_itself(
162+
def test_acknowledge_reminder_task_reschedules_itself_and_sends_signal(
163+
mock_send_post_ack_reminder_message_signal,
161164
mock_acknowledge_reminder_task,
162165
mock_unacknowledge_timeout_task,
163166
ack_reminder_test_setup,
@@ -169,9 +172,6 @@ def test_acknowledge_reminder_task_reschedules_itself(
169172
with django_capture_on_commit_callbacks(execute=True) as callbacks:
170173
acknowledge_reminder_task(alert_group.pk, TASK_ID)
171174

172-
# send_alert_group_signal task is queued after commit
173-
assert len(callbacks) == 1
174-
175175
mock_unacknowledge_timeout_task.assert_not_called()
176176
mock_acknowledge_reminder_task.assert_called_once_with(
177177
(alert_group.pk, TASK_ID),
@@ -182,6 +182,10 @@ def test_acknowledge_reminder_task_reschedules_itself(
182182
assert log_record.type == AlertGroupLogRecord.TYPE_ACK_REMINDER_TRIGGERED
183183
assert log_record.author == alert_group.acknowledged_by_user
184184

185+
# send_post_ack_reminder_message_signal task is queued after commit
186+
assert len(callbacks) == 1
187+
mock_send_post_ack_reminder_message_signal.assert_called_once_with(log_record.id)
188+
185189

186190
@patch.object(unacknowledge_timeout_task, "apply_async")
187191
@patch.object(acknowledge_reminder_task, "apply_async")
@@ -369,3 +373,46 @@ def test_ack_reminder_cancel_too_old(
369373
mock_acknowledge_reminder_task.assert_not_called()
370374

371375
assert not alert_group.log_records.exists()
376+
377+
378+
@pytest.mark.django_db
379+
def test_acknowledge_reminder_skip_doubled_notification(
380+
ack_reminder_test_setup,
381+
django_capture_on_commit_callbacks,
382+
caplog,
383+
):
384+
organization, alert_group, user = ack_reminder_test_setup(
385+
unacknowledge_timeout=Organization.UNACKNOWLEDGE_TIMEOUT_NEVER
386+
)
387+
expected_log_text = f"Acknowledge reminder for alert_group {alert_group.id} has already been sent recently."
388+
389+
# check task lock cache doesn't exist
390+
lock_id = f"acknowledge-reminder-lock-{alert_group.id}"
391+
assert cache.get(lock_id) is None
392+
393+
with patch.object(acknowledge_reminder_task, "apply_async") as mock_acknowledge_reminder_task:
394+
with patch.object(send_post_ack_reminder_message_signal, "delay") as mock_send_signal:
395+
with django_capture_on_commit_callbacks(execute=True):
396+
acknowledge_reminder_task(alert_group.pk, TASK_ID)
397+
398+
log_record = alert_group.log_records.get()
399+
assert log_record.type == AlertGroupLogRecord.TYPE_ACK_REMINDER_TRIGGERED
400+
mock_send_signal.assert_called_once_with(log_record.id)
401+
mock_acknowledge_reminder_task.assert_called_once()
402+
403+
# check task lock cache exists
404+
assert cache.get(lock_id) == TASK_ID
405+
406+
assert expected_log_text not in caplog.text
407+
408+
# acknowledge_reminder_task doesn't proceed the second time if it has been called recently
409+
with patch.object(acknowledge_reminder_task, "apply_async") as mock_acknowledge_reminder_task:
410+
with patch.object(send_post_ack_reminder_message_signal, "delay") as mock_send_signal:
411+
with django_capture_on_commit_callbacks(execute=True):
412+
acknowledge_reminder_task(alert_group.pk, TASK_ID)
413+
414+
assert alert_group.log_records.count() == 1
415+
mock_send_signal.assert_not_called()
416+
mock_acknowledge_reminder_task.assert_not_called()
417+
418+
assert expected_log_text in caplog.text

engine/apps/slack/representatives/alert_group_representative.py

+48-19
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import logging
2+
import typing
23

34
from celery.utils.log import get_task_logger
45
from django.conf import settings
@@ -9,6 +10,9 @@
910
from apps.slack.scenarios.scenario_step import ScenarioStep
1011
from common.custom_celery_tasks import shared_dedicated_queue_retry_task
1112

13+
if typing.TYPE_CHECKING:
14+
from apps.alerts.models import AlertGroupLogRecord
15+
1216
logger = get_task_logger(__name__)
1317
logger.setLevel(logging.DEBUG)
1418

@@ -130,29 +134,17 @@ def on_create_alert(cls, **kwargs):
130134
@classmethod
131135
def on_alert_group_action_triggered(cls, **kwargs):
132136
logger.debug("Received alert_group_action_triggered signal in SLACK representative")
133-
from apps.alerts.models import AlertGroupLogRecord
134-
135-
log_record = kwargs["log_record"]
136-
force_sync = kwargs.get("force_sync", False)
137-
if isinstance(log_record, AlertGroupLogRecord):
138-
log_record_id = log_record.pk
139-
else:
140-
log_record_id = log_record
141-
142-
try:
143-
log_record = AlertGroupLogRecord.objects.get(pk=log_record_id)
144-
except AlertGroupLogRecord.DoesNotExist:
145-
logger.warning(
146-
f"on_alert_group_action_triggered: log record {log_record_id} never created or has been deleted"
147-
)
137+
log_record = cls.get_log_record_from_kwargs(**kwargs)
138+
if not log_record:
148139
return
140+
force_sync = kwargs.get("force_sync", False)
149141

150142
if log_record.action_source == ActionSource.SLACK or force_sync:
151-
logger.debug(f"SLACK on_alert_group_action_triggered: sync {log_record_id} {force_sync}")
152-
on_alert_group_action_triggered_async(log_record_id)
143+
logger.debug(f"SLACK on_alert_group_action_triggered: sync {log_record.id} {force_sync}")
144+
on_alert_group_action_triggered_async(log_record.id)
153145
else:
154-
logger.debug(f"SLACK on_alert_group_action_triggered: async {log_record_id} {force_sync}")
155-
on_alert_group_action_triggered_async.apply_async((log_record_id,))
146+
logger.debug(f"SLACK on_alert_group_action_triggered: async {log_record.id} {force_sync}")
147+
on_alert_group_action_triggered_async.apply_async((log_record.id,))
156148

157149
@classmethod
158150
def on_alert_group_update_resolution_note(cls, **kwargs):
@@ -167,6 +159,26 @@ def on_alert_group_update_resolution_note(cls, **kwargs):
167159
step = UpdateResolutionNoteStep(organization.slack_team_identity, organization)
168160
step.process_signal(alert_group, resolution_note)
169161

162+
@classmethod
163+
def on_alert_group_post_acknowledge_reminder_message(cls, **kwargs):
164+
log_record = cls.get_log_record_from_kwargs(**kwargs)
165+
if not log_record:
166+
return
167+
slack_team_identity = log_record.alert_group.channel.organization.slack_team_identity
168+
alert_group = log_record.alert_group
169+
logger.debug(
170+
f"Received post_ack_reminder_message_signal signal in SLACK representative for alert_group {alert_group.id}"
171+
)
172+
if not (log_record.alert_group.slack_message and slack_team_identity):
173+
logger.debug(
174+
f"SLACK representative is NOT applicable for alert_group {alert_group.id}, log record {log_record.id}"
175+
)
176+
return
177+
178+
AcknowledgeConfirmationStep = ScenarioStep.get_step("distribute_alerts", "AcknowledgeConfirmationStep")
179+
step = AcknowledgeConfirmationStep(slack_team_identity)
180+
step.process_signal(log_record)
181+
170182
def on_acknowledge(self):
171183
AcknowledgeGroupStep = ScenarioStep.get_step("distribute_alerts", "AcknowledgeGroupStep")
172184
step = AcknowledgeGroupStep(self.log_record.alert_group.channel.organization.slack_team_identity)
@@ -229,6 +241,7 @@ def on_auto_un_acknowledge(self):
229241
self.on_un_acknowledge()
230242

231243
def on_ack_reminder_triggered(self):
244+
# deprecated, remove this handler after release
232245
AcknowledgeConfirmationStep = ScenarioStep.get_step("distribute_alerts", "AcknowledgeConfirmationStep")
233246
step = AcknowledgeConfirmationStep(self.log_record.alert_group.channel.organization.slack_team_identity)
234247
step.process_signal(self.log_record)
@@ -258,3 +271,19 @@ def get_handler_name(self):
258271
@classmethod
259272
def on_handler_not_found(cls):
260273
pass
274+
275+
@classmethod
276+
def get_log_record_from_kwargs(cls, **kwargs) -> typing.Optional["AlertGroupLogRecord"]:
277+
from apps.alerts.models import AlertGroupLogRecord
278+
279+
log_record = None
280+
log_record_id = kwargs["log_record"]
281+
282+
if not isinstance(log_record_id, AlertGroupLogRecord):
283+
try:
284+
log_record = AlertGroupLogRecord.objects.get(pk=log_record_id)
285+
except AlertGroupLogRecord.DoesNotExist:
286+
logger.warning(f"log record {log_record_id} never created or has been deleted")
287+
else:
288+
log_record = log_record_id
289+
return log_record

engine/apps/telegram/alert_group_representative.py

-1
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,6 @@ def get_handlers_map():
4141
AlertGroupLogRecord.TYPE_AUTO_UN_ACK: "alert_group_action",
4242
AlertGroupLogRecord.TYPE_RESOLVED: "alert_group_action",
4343
AlertGroupLogRecord.TYPE_UN_RESOLVED: "alert_group_action",
44-
AlertGroupLogRecord.TYPE_ACK_REMINDER_TRIGGERED: "alert_group_action",
4544
AlertGroupLogRecord.TYPE_SILENCE: "alert_group_action",
4645
AlertGroupLogRecord.TYPE_UN_SILENCE: "alert_group_action",
4746
AlertGroupLogRecord.TYPE_ATTACHED: "alert_group_action",

engine/settings/celery_task_routes.py

+1
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,7 @@
9393
# CRITICAL
9494
"apps.alerts.tasks.acknowledge_reminder.acknowledge_reminder_task": {"queue": "critical"},
9595
"apps.alerts.tasks.acknowledge_reminder.unacknowledge_timeout_task": {"queue": "critical"},
96+
"apps.alerts.tasks.acknowledge_reminder.send_post_ack_reminder_message_signal": {"queue": "critical"},
9697
"apps.alerts.tasks.declare_incident.declare_incident": {"queue": "critical"},
9798
"apps.alerts.tasks.distribute_alert.send_alert_create_signal": {"queue": "critical"},
9899
"apps.alerts.tasks.escalate_alert_group.escalate_alert_group": {"queue": "critical"},

0 commit comments

Comments
 (0)