From 30600a11c5d44fa0b8a0d80d47583db177e7859b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Steven!=20Ragnar=C3=B6k?= Date: Thu, 7 Nov 2019 17:30:50 -0500 Subject: [PATCH 1/5] Expressly destroy a node's objects before the node. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This seems to reduce hangs during test runs described in https://github.com/ros2/build_cop/issues/248. The handles corresponding to the destroyed objects *should* be getting destroyed explicitly when self.handle.destroy() is called below. It seems however that when running with Fast-RTPS it's possible to get into a state where multiple threads are waiting on futexes and none can move forward. The rclpy context of this apparent deadlock is while clearing a node's list of publishers or services (possibly others, although publishers and services were the only ones observed). I consider this patch to be a workaround rather than a fix. I think there may either be a race condition between the rcl/rmw layer and the rmw implementation layer which is being tripped by the haphazardness of Python's garbage collector or there is a logical problem with the handle destruction ordering in rclpy that only Fast-RTPS is sensitive to. Signed-off-by: Steven! Ragnarök --- rclpy/rclpy/node.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/rclpy/rclpy/node.py b/rclpy/rclpy/node.py index 0c8295ca0..d07161ff6 100644 --- a/rclpy/rclpy/node.py +++ b/rclpy/rclpy/node.py @@ -1461,12 +1461,18 @@ def destroy_node(self) -> bool: # It will be destroyed with other publishers below. self._parameter_event_publisher = None - self.__publishers.clear() - self.__subscriptions.clear() - self.__clients.clear() - self.__services.clear() - self.__timers.clear() - self.__guards.clear() + for p in self.__publishers: + self.destroy_publisher(p) + for s in self.__subscriptions: + self.destroy_subscriber(s) + for c in self.__clients: + self.destroy_client(c) + for s in self.__services: + self.destroy_service(s) + for t in self.__timers: + self.destroy_timer(t) + for g in self.__guards: + self.destroy_guard(g) self.handle.destroy() self._wake_executor() From 03378671f3720798bd6c0efae0311dd2af74e5c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Steven!=20Ragnar=C3=B6k?= Date: Fri, 8 Nov 2019 08:09:36 -0500 Subject: [PATCH 2/5] Don't iterate over a collection being modified. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Steven! Ragnarök --- rclpy/rclpy/node.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/rclpy/rclpy/node.py b/rclpy/rclpy/node.py index d07161ff6..33c73b525 100644 --- a/rclpy/rclpy/node.py +++ b/rclpy/rclpy/node.py @@ -1461,18 +1461,18 @@ def destroy_node(self) -> bool: # It will be destroyed with other publishers below. self._parameter_event_publisher = None - for p in self.__publishers: - self.destroy_publisher(p) - for s in self.__subscriptions: - self.destroy_subscriber(s) - for c in self.__clients: - self.destroy_client(c) - for s in self.__services: - self.destroy_service(s) - for t in self.__timers: - self.destroy_timer(t) - for g in self.__guards: - self.destroy_guard(g) + while self.__publishers: + self.destroy_publisher(self.__publishers.pop()) + while self.__subscriptions: + self.destroy_subscriber(self.__subscriptions.pop()) + while self.__clients: + self.destroy_client(self.__clients.pop()) + while self.__services: + self.destroy_service(self.__services.pop()) + while self.__timers: + self.destroy_timer(self.__timers.pop()) + while self.__guards: + self.destroy_guard(self.__guards.pop()) self.handle.destroy() self._wake_executor() From deee908a0e6e92cedaf82613d5e454f597b80126 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Steven!=20Ragnaro=CC=88k?= Date: Fri, 8 Nov 2019 11:24:36 -0500 Subject: [PATCH 3/5] Fix whitespace and method name. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Steven! Ragnarök --- rclpy/rclpy/node.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/rclpy/rclpy/node.py b/rclpy/rclpy/node.py index 33c73b525..1399eb60d 100644 --- a/rclpy/rclpy/node.py +++ b/rclpy/rclpy/node.py @@ -1462,17 +1462,17 @@ def destroy_node(self) -> bool: self._parameter_event_publisher = None while self.__publishers: - self.destroy_publisher(self.__publishers.pop()) + self.destroy_publisher(self.__publishers.pop()) while self.__subscriptions: - self.destroy_subscriber(self.__subscriptions.pop()) + self.destroy_subscription(self.__subscriptions.pop()) while self.__clients: - self.destroy_client(self.__clients.pop()) + self.destroy_client(self.__clients.pop()) while self.__services: - self.destroy_service(self.__services.pop()) + self.destroy_service(self.__services.pop()) while self.__timers: - self.destroy_timer(self.__timers.pop()) + self.destroy_timer(self.__timers.pop()) while self.__guards: - self.destroy_guard(self.__guards.pop()) + self.destroy_guard(self.__guards.pop()) self.handle.destroy() self._wake_executor() From 025ffee52cfdd4e8dcb394566d56f4345a5384cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Steven!=20Ragnaro=CC=88k?= Date: Fri, 8 Nov 2019 11:37:18 -0500 Subject: [PATCH 4/5] Fix method name. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Steven! Ragnarök --- rclpy/rclpy/node.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rclpy/rclpy/node.py b/rclpy/rclpy/node.py index 1399eb60d..68772bd70 100644 --- a/rclpy/rclpy/node.py +++ b/rclpy/rclpy/node.py @@ -1472,7 +1472,7 @@ def destroy_node(self) -> bool: while self.__timers: self.destroy_timer(self.__timers.pop()) while self.__guards: - self.destroy_guard(self.__guards.pop()) + self.destroy_guard_condition(self.__guards.pop()) self.handle.destroy() self._wake_executor() From 503b3dbaa8701f5377e2b23e6930d263d6299106 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Steven!=20Ragnaro=CC=88k?= Date: Fri, 8 Nov 2019 14:30:22 -0500 Subject: [PATCH 5/5] Don't pre-emptively remove items from Node lists. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As pointed out by Shane, pop()ing each item from the list before passing it to the .destroy_ITEM() method prevents it from being destroyed as the individual methods first check that the item is present in the list then remove it before continuing to destroy it. Signed-off-by: Steven! Ragnarök --- rclpy/rclpy/node.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/rclpy/rclpy/node.py b/rclpy/rclpy/node.py index 68772bd70..94afaf499 100644 --- a/rclpy/rclpy/node.py +++ b/rclpy/rclpy/node.py @@ -1461,18 +1461,20 @@ def destroy_node(self) -> bool: # It will be destroyed with other publishers below. self._parameter_event_publisher = None + # Destroy dependent items eagerly to work around a possible hang + # https://github.com/ros2/build_cop/issues/248 while self.__publishers: - self.destroy_publisher(self.__publishers.pop()) + self.destroy_publisher(self.__publishers[0]) while self.__subscriptions: - self.destroy_subscription(self.__subscriptions.pop()) + self.destroy_subscription(self.__subscriptions[0]) while self.__clients: - self.destroy_client(self.__clients.pop()) + self.destroy_client(self.__clients[0]) while self.__services: - self.destroy_service(self.__services.pop()) + self.destroy_service(self.__services[0]) while self.__timers: - self.destroy_timer(self.__timers.pop()) + self.destroy_timer(self.__timers[0]) while self.__guards: - self.destroy_guard_condition(self.__guards.pop()) + self.destroy_guard_condition(self.__guards[0]) self.handle.destroy() self._wake_executor()