Skip to content

Commit b907303

Browse files
committed
qubes/vm: Improve stopped event handling
The previous version did not ensure that the stopped/shutdown event was handled before a new VM start. This can easily lead to problems like in QubesOS/qubes-issues#3164. This improved version now ensures that the stopped/shutdown events are handled before a new VM start. Additionally this version should be more robust against unreliable events from libvirt. It handles missing, duplicated and delayed stopped events. Instead of one 'domain-shutdown' event there are now 'domain-stopped' and 'domain-shutdown'. The later is generated after the former. This way it's easy to run code after the VM is shutdown including the stop of it's storage.
1 parent 682d950 commit b907303

File tree

4 files changed

+126
-37
lines changed

4 files changed

+126
-37
lines changed

qubes/app.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1183,7 +1183,7 @@ def _domain_event_callback(self, _conn, domain, event, _detail, _opaque):
11831183
return
11841184

11851185
if event == libvirt.VIR_DOMAIN_EVENT_STOPPED:
1186-
vm.fire_event('domain-shutdown')
1186+
vm.on_libvirt_domain_stopped()
11871187

11881188
@qubes.events.handler('domain-pre-delete')
11891189
def on_domain_pre_deleted(self, event, vm):

qubes/tests/integ/basic.py

-1
Original file line numberDiff line numberDiff line change
@@ -137,7 +137,6 @@ def _test_201_domain_shutdown_handler(self, vm, event, **kwargs):
137137
self.test_failure_reason = 'domain-shutdown event received twice'
138138
self.domain_shutdown_handled = True
139139

140-
@unittest.expectedFailure
141140
def test_201_shutdown_event_race(self):
142141
'''Regression test for 3164 - pure events edition'''
143142
vmname = self.make_vm_name('appvm')

qubes/vm/dispvm.py

+12-16
Original file line numberDiff line numberDiff line change
@@ -129,18 +129,18 @@ def on_property_pre_set_template(self, event, name, newvalue=None,
129129
raise qubes.exc.QubesValueError(self,
130130
'Cannot change template of Disposable VM')
131131

132+
@qubes.events.handler('domain-shutdown')
132133
@asyncio.coroutine
133-
def on_domain_shutdown_coro(self):
134-
'''Coroutine for executing cleanup after domain shutdown.
134+
def on_domain_shutdown(self, _event, **_kwargs):
135+
yield from self._auto_cleanup()
135136

136-
This override default action defined in QubesVM.on_domain_shutdown_coro
137-
'''
138-
with (yield from self.startup_lock):
139-
yield from self.storage.stop()
140-
if self.auto_cleanup and self in self.app.domains:
141-
yield from self.remove_from_disk()
142-
del self.app.domains[self]
143-
self.app.save()
137+
@asyncio.coroutine
138+
def _auto_cleanup(self):
139+
'''Do auto cleanup if enabled'''
140+
if self.auto_cleanup and self in self.app.domains:
141+
yield from self.remove_from_disk()
142+
del self.app.domains[self]
143+
self.app.save()
144144

145145
@classmethod
146146
@asyncio.coroutine
@@ -206,10 +206,6 @@ def start(self, **kwargs):
206206

207207
yield from super(DispVM, self).start(**kwargs)
208208
except:
209-
# cleanup also on failed startup; there is potential race with
210-
# self.on_domain_shutdown_coro, so check if wasn't already removed
211-
if self.auto_cleanup and self in self.app.domains:
212-
yield from self.remove_from_disk()
213-
del self.app.domains[self]
214-
self.app.save()
209+
# Cleanup also on failed startup
210+
yield from self._auto_cleanup()
215211
raise

qubes/vm/qubesvm.py

+113-19
Original file line numberDiff line numberDiff line change
@@ -151,9 +151,33 @@ class QubesVM(qubes.vm.mix.net.NetVMMixin, qubes.vm.BaseVM):
151151
152152
*other arguments are as in :py:meth:`start`*
153153
154+
.. event:: domain-stopped (subject, event)
155+
156+
Fired when domain has been stopped.
157+
158+
This event is emitted before ``'domain-shutdown'`` and will trigger
159+
the cleanup in QubesVM. So if you require that the cleanup has
160+
already run use ``'domain-shutdown'``.
161+
162+
Note that you can receive this event as soon as you received
163+
``'domain-pre-start'``. This also can be emitted in case of a
164+
startup failure, before or after ``'domain-start-failed'``.
165+
166+
Handler for this event can be asynchronous (a coroutine).
167+
168+
:param subject: Event emitter (the qube object)
169+
:param event: Event name (``'domain-stopped'``)
170+
154171
.. event:: domain-shutdown (subject, event)
155172
156-
Fired when domain has been shut down.
173+
Fired when domain has been shut down. It is generated after
174+
``'domain-stopped'``.
175+
176+
Note that you can receive this event as soon as you received
177+
``'domain-pre-start'``. This also can be emitted in case of a
178+
startup failure, before or after ``'domain-start-failed'``.
179+
180+
Handler for this event can be asynchronous (a coroutine).
157181
158182
:param subject: Event emitter (the qube object)
159183
:param event: Event name (``'domain-shutdown'``)
@@ -643,6 +667,17 @@ def __init__(self, app, xml, volume_config=None, **kwargs):
643667
self._libvirt_domain = None
644668
self._qdb_connection = None
645669

670+
# We assume a fully halted VM here. The 'domain-init' handler will
671+
# check if the VM is already running.
672+
self._domain_stopped_event_received = True
673+
self._domain_stopped_event_handled = True
674+
675+
self._domain_stopped_future = None
676+
677+
# Internal lock to ensure ordering between _domain_stopped_coro() and
678+
# start(). This should not be accessed anywhere else.
679+
self._domain_stopped_lock = asyncio.Lock()
680+
646681
if xml is None:
647682
# we are creating new VM and attributes came through kwargs
648683
assert hasattr(self, 'qid')
@@ -712,6 +747,8 @@ def on_domain_init_loaded(self, event):
712747

713748
if not self.app.vmm.offline_mode and self.is_running():
714749
self.start_qdb_watch()
750+
self._domain_stopped_event_received = False
751+
self._domain_stopped_event_handled = False
715752

716753
@qubes.events.handler('property-set:label')
717754
def on_property_set_label(self, event, name, newvalue, oldvalue=None):
@@ -797,6 +834,30 @@ def start(self, start_guid=True, notify_function=None,
797834
if self.get_power_state() != 'Halted':
798835
return
799836

837+
with (yield from self._domain_stopped_lock):
838+
# Don't accept any new stopped event's till a new VM has been
839+
# created. If we didn't received any stopped event or it wasn't
840+
# handled yet we will handle this in the next lines.
841+
self._domain_stopped_event_received = True
842+
843+
if self._domain_stopped_future is not None:
844+
# Libvirt stopped event was already received, so cancel the
845+
# future. If it didn't generate the Qubes events yet we
846+
# will do it below.
847+
self._domain_stopped_future.cancel()
848+
self._domain_stopped_future = None
849+
850+
if not self._domain_stopped_event_handled:
851+
# No Qubes domain-stopped events have been generated yet.
852+
# So do this now.
853+
854+
# Set this immediately such that we don't generate events
855+
# twice if an exception gets thrown.
856+
self._domain_stopped_event_handled = True
857+
858+
yield from self.fire_event_async('domain-stopped')
859+
yield from self.fire_event_async('domain-shutdown')
860+
800861
self.log.info('Starting {}'.format(self.name))
801862

802863
yield from self.fire_event_async('domain-pre-start',
@@ -816,14 +877,17 @@ def start(self, start_guid=True, notify_function=None,
816877
qmemman_client = yield from asyncio.get_event_loop().\
817878
run_in_executor(None, self.request_memory, mem_required)
818879

880+
yield from self.storage.start()
881+
819882
except Exception as exc:
820883
# let anyone receiving domain-pre-start know that startup failed
821884
yield from self.fire_event_async('domain-start-failed',
822885
reason=str(exc))
886+
if qmemman_client:
887+
qmemman_client.close()
823888
raise
824889

825890
try:
826-
yield from self.storage.start()
827891
self._update_libvirt_domain()
828892

829893
self.libvirt_domain.createWithFlags(
@@ -833,12 +897,16 @@ def start(self, start_guid=True, notify_function=None,
833897
# let anyone receiving domain-pre-start know that startup failed
834898
yield from self.fire_event_async('domain-start-failed',
835899
reason=str(exc))
900+
self.storage.stop()
836901
raise
837902

838903
finally:
839904
if qmemman_client:
840905
qmemman_client.close()
841906

907+
self._domain_stopped_event_received = False
908+
self._domain_stopped_event_handled = False
909+
842910
try:
843911
yield from self.fire_event_async('domain-spawn',
844912
start_guid=start_guid)
@@ -870,24 +938,50 @@ def start(self, start_guid=True, notify_function=None,
870938

871939
return self
872940

873-
@asyncio.coroutine
874-
def on_domain_shutdown_coro(self):
875-
'''Coroutine for executing cleanup after domain shutdown.
876-
Do not allow domain to be started again until this finishes.
941+
def on_libvirt_domain_stopped(self):
942+
''' Handle VIR_DOMAIN_EVENT_STOPPED events from libvirt.
943+
944+
This is not a Qubes event handler. Instead we do some sanity checks
945+
and synchronization with start() and then emits Qubes events.
877946
'''
878-
with (yield from self.startup_lock):
879-
try:
880-
yield from self.storage.stop()
881-
except qubes.storage.StoragePoolException:
882-
self.log.exception('Failed to stop storage for domain %s',
883-
self.name)
884-
885-
@qubes.events.handler('domain-shutdown')
886-
def on_domain_shutdown(self, _event, **_kwargs):
887-
'''Cleanup after domain shutdown'''
888-
# TODO: ensure that domain haven't been started _before_ this
889-
# coroutine got a chance to acquire a lock
890-
asyncio.ensure_future(self.on_domain_shutdown_coro())
947+
948+
state = self.get_power_state()
949+
if state not in ['Halted', 'Crashed', 'Dying']:
950+
self.log.warning('Stopped event from libvirt received,'
951+
' but domain is in state {}!'.format(state))
952+
# ignore this unexpected event
953+
return
954+
955+
if self._domain_stopped_event_received:
956+
self.log.warning('Duplicated stopped event from libvirt received!')
957+
# ignore this unexpected event
958+
return
959+
960+
self._domain_stopped_event_received = True
961+
self._domain_stopped_future = \
962+
asyncio.ensure_future(self._domain_stopped_coro())
963+
964+
@asyncio.coroutine
965+
def _domain_stopped_coro(self):
966+
with (yield from self._domain_stopped_lock):
967+
assert not self._domain_stopped_event_handled
968+
969+
# Set this immediately such that we don't generate events twice if
970+
# an exception gets thrown.
971+
self._domain_stopped_event_handled = True
972+
973+
yield from self.fire_event_async('domain-stopped')
974+
yield from self.fire_event_async('domain-shutdown')
975+
976+
@qubes.events.handler('domain-stopped')
977+
@asyncio.coroutine
978+
def on_domain_stopped(self, _event, **_kwargs):
979+
'''Cleanup after domain was stopped'''
980+
try:
981+
yield from self.storage.stop()
982+
except qubes.storage.StoragePoolException:
983+
self.log.exception('Failed to stop storage for domain %s',
984+
self.name)
891985

892986
@asyncio.coroutine
893987
def shutdown(self, force=False, wait=False):

0 commit comments

Comments
 (0)