June demo #447

Closed · wants to merge 15 commits
@@ -360,6 +360,7 @@ def reset_one_tracker(self, tracker_ind):
self.trackers[tracker_ind]["current_granular_step"] = 0
self.trackers[tracker_ind]["active"] = True
self.tracker_resets.append(self.trackers[tracker_ind]["recipe"])
return self.trackers

def granular_to_broad_step(self, tracker, granular_step):
"""
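The one functional change in this hunk is that `reset_one_tracker` now returns the updated tracker list instead of returning None. A minimal, self-contained sketch of why that is convenient for callers (the class below is a hypothetical stand-in, since the diff does not show which file this hunk belongs to):

# Hypothetical stand-in for the class that owns reset_one_tracker().
class TrackerPool:
    def __init__(self, recipes):
        self.trackers = [
            {"recipe": r, "current_granular_step": 7, "active": False}
            for r in recipes
        ]
        self.tracker_resets = []

    def reset_one_tracker(self, tracker_ind):
        self.trackers[tracker_ind]["current_granular_step"] = 0
        self.trackers[tracker_ind]["active"] = True
        self.tracker_resets.append(self.trackers[tracker_ind]["recipe"])
        return self.trackers  # the new return value

pool = TrackerPool(["r18"])
trackers = pool.reset_one_tracker(0)
# Callers can now verify the reset without reaching into pool.trackers.
assert trackers[0]["active"] and trackers[0]["current_granular_step"] == 0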
8 changes: 4 additions & 4 deletions ansible/roles/provision-files/vars/main.yml
@@ -122,15 +122,15 @@ girder_file_downloads:
sha512: 7183385f8eaca85997725a107a76034de2bd4a59c1434b4bdb7c1ac8931cf4b68a53f6e736734643386364b9f0856de795a14965b6a02bc5eb5891252e6a73c9
dest: "{{ stage_dirs.object_detector }}/r18_det.pt"
# Activity classifier
- file_id: 666c8f5c35faf6f99fbb43ea
sha512: e8207b092f19659217f51dc41ba8c964ccb9c664d23d235b6166f8974c19eda3b1a098c451f6caa17bfc41148aea249eeeabdf794e2c4eff3c09bc2f009978ef
- file_id: 66463a80687336214e7cdeaa
sha512: 11c417480ef3b178cea5e9ba01878e2c1d2978129db4d8ac5d7eb0a1e6bdec0da74fc54f6bd69c8d04311809fc78503fad5c57fe45c5ee09f0f4f68197e34be1
dest: "{{ stage_dirs.activity_classifier }}/r18_tcn.ckpt"
- file_id: 6606b6e9aa5c8de3874c3f4a
sha512: 3c84333390ee6b12327bb7e5debed37149c7f95cc437b16939f77b599d1a0b3b8c4f0995820b95973170b94df695494000a1c45fbf46632267c212e125fe58a3
dest: "{{ stage_dirs.activity_classifier }}/r18_mapping.txt"
# Global Step predictor model
- file_id: 666c8ef435faf6f99fbb43e8
sha512: ff121c00a9406b21c7c193cbbf8a2a271fc4374f05709f170565e5b64617c41fef18281653968d43740648888d0aee9c9ab4009eb791857c9a28bd6063384807
- file_id: 66464bf9687336214e7cdeae
sha512: bc7884c258cb8704372dd69a3e7d999eaf70d716d271982077c7216ef27ab01219ef1e488f6f9669e11a7e8aa6ffb9d8e07d74edc47294f90cc9e6a723f5a822
dest: "{{ stage_dirs.task_monitor }}/global_step_predictor_act_avgs_R18.npy"

# List of git repositories to check out at a specific ref and then archive.
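Each download above is pinned to a SHA-512 digest, so swapping in the new R18 models means updating both the girder file_id and its checksum. A standalone sketch (not part of this PR) for sanity-checking a fetched artifact against the new pin; the local filename is an assumption:

import hashlib

def sha512_of(path, chunk_size=1 << 20):
    """Stream a file through SHA-512, matching the `sha512` pins above."""
    digest = hashlib.sha512()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()

# Verify the updated activity-classifier checkpoint after download.
expected = (
    "11c417480ef3b178cea5e9ba01878e2c1d2978129db4d8ac5d7eb0a1e6bdec0d"
    "a74fc54f6bd69c8d04311809fc78503fad5c57fe45c5ee09f0f4f68197e34be1"
)
assert sha512_of("r18_tcn.ckpt") == expected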
8 changes: 4 additions & 4 deletions config/activity_labels/medical/r18-demo.yaml
@@ -9,16 +9,16 @@ labels:
full_str: "background"
- id: 1
label: "cover-seal-wound"
full_str: "With your non-dominant hand, cover and seal wound site."
full_str: "With your hand, cover and seal wound site."
- id: 2
label: "open-pack"
full_str: "With your dominant hand, open vented chest seal package. Keep your non-dominant hand on the wound site."
full_str: "Open vented chest seal package, while keeping wound site sealed."
- id: 3
label: "clean-wound-site"
full_str: "With your dominant hand, wipe blood and body fluids from wound site."
full_str: "Wipe blood and body fluids from wound site, while keeping wound site sealed."
- id: 4
label: "peel-seal-backer"
full_str: "Peel away chest seal backer."
full_str: "Peel away chest seal backer, while keeping wound site sealed."
- id: 5
label: "place-seal"
full_str: "Place chest seal with circle of vents over wound site and seal to chest."
21 changes: 11 additions & 10 deletions config/llm_prompts/r18_steps_prompt
@@ -1,5 +1,5 @@
###
You are a professional medic teaching me how to best save another person's life.
You are a professional medic guiding me through the process of applying a chest seal.

Here are frequently asked questions:
Question: What is a chest seal and why is it used?
@@ -24,24 +24,25 @@ Question: What if the chest seal gets clogged with blood?
Answer: Monitor the person. If breathing worsens, burp the seal or replace it.

Here are the instructions of the current task you are trying to teach me:
1) With gloved hand, cover and seal wound site.
2) Open vented chest seal package.
3) Wipe blood and body fluids from wound site.
4) Peel away chest seal backer.
1) With your hand, cover and seal wound site.
2) Open vented chest seal package, while keeping wound site sealed.
3) Wipe blood and body fluids from wound site, while keeping wound site sealed.
4) Peel away chest seal backer, while keeping wound site sealed.
5) Place chest seal with circle of vents over wound site and seal to chest.
{taskactivity}

When you answer my question, follow these rules:
* Use information from the instructions above.
* It should not deviate from the instructions, except when medical instruments are not available.
* If I ask a question not related to medicine, answer with: "Sorry, I can't help you with that".
* You should always respond in a conversational tone.
* Your answer should not deviate from the instructions, except when I have trouble with the instructions.
* Focus on questions related to medicine, the objects used in the instructions, the user, and the user's environment.
* You should always respond in a conversational tone, very direct and precise.
* DO NOT ANSWER "I'm sorry, I am an AI language model and I cannot see or perceive anything."
* Don't use the phrasing "However,.."
* Don't say "Based on the information you provided,"
* Do not read the entire instructions to me. Just give me one at a time.

* Keep your answers very short, blunt and straightforward. I have to act fast.
* Do not call the attached image an 'image'. Pretend this is what you and I can see.
###

{question}
Your answer (very short, precise, helpful with empathy):
Your answer (very short, precise):
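The template keeps two placeholders, {taskactivity} and {question}. Assuming they are filled with plain Python string formatting (the node code that renders this prompt is not part of this diff), usage looks roughly like:

from pathlib import Path

# Hypothetical rendering of the prompt template; the real node may differ.
template = Path("config/llm_prompts/r18_steps_prompt").read_text()
prompt = template.format(
    taskactivity="Current step: 4) Peel away chest seal backer.",
    question="Do I place the seal now?",
)
# `prompt` is now ready to send to the LLM backend.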
8 changes: 4 additions & 4 deletions config/tasks/medical/r18-demo.yaml
@@ -15,19 +15,19 @@ labels:
activity_ids: [0]
- id: 1
label: "cover-seal-wound"
full_str: "With your non-dominant hand, cover and seal wound site."
full_str: "With your hand, cover and seal wound site."
activity_ids: [1]
- id: 2
label: "open-pack"
full_str: "With your dominant hand, open vented chest seal package. Keep your non-dominant hand on the wound site."
full_str: "Open vented chest seal package, while keeping wound site sealed."
activity_ids: [2]
- id: 3
label: "clean-wound-site"
full_str: "With your dominant hand, wipe blood and body fluids from wound site."
full_str: "Wipe blood and body fluids from wound site, while keeping wound site sealed."
activity_ids: [3]
- id: 4
label: "peel-seal-backer"
full_str: "Peel away chest seal backer."
full_str: "Peel away chest seal backer, while keeping wound site sealed."
activity_ids: [4]
- id: 5
label: "place-seal"
3 changes: 3 additions & 0 deletions ros/angel_msgs/msg/DialogueUtterance.msg
@@ -11,6 +11,9 @@ std_msgs/Header header
# Speech-to-text of the user utterance we have interpreted
string utterance_text

# Image frame at the time the user utterance was captured
string pov_frame

# Below are optional fields

# Canonical user intent that has been interpreted. "Canonical" in this context
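Because pov_frame is a plain string field, the frame travels inside the message as base64-encoded JPEG bytes (that is how the ASR node below fills it). A short sketch of how a consumer could recover a PIL image from a received message; the helper name is illustrative:

import base64
import io

import PIL.Image

def decode_pov_frame(pov_frame: str):
    """Recover a PIL image from DialogueUtterance.pov_frame, or None."""
    if not pov_frame:
        return None  # the publisher sends "" when no frame was available
    img = PIL.Image.open(io.BytesIO(base64.b64decode(pov_frame)))
    img.load()  # force the decode while the buffer is in scope
    return img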
82 changes: 71 additions & 11 deletions ros/angel_system_nodes/angel_system_nodes/audio/asr.py
@@ -1,19 +1,27 @@
import json
import numpy as np
import queue
import requests
import soundfile
import tempfile
from termcolor import colored
import threading
import base64
import PIL.Image
import io
from cv_bridge import CvBridge
from sensor_msgs.msg import Image

import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize
import rclpy

from angel_msgs.msg import HeadsetAudioData, DialogueUtterance, SystemTextResponse
from angel_system_nodes.audio import dialogue
from angel_utils import make_default_main

BRIDGE = CvBridge()

AUDIO_TOPIC = "audio_topic"
UTTERANCES_TOPIC = "utterances_topic"
@@ -22,6 +30,7 @@
ASR_REQ_SEGMENT_SECONDS_DURATION = "asr_req_segment_duration"
IS_SENTENCE_TOKENIZE = "is_sentence_tokenize"
DEBUG_MODE = "debug_mode"
IMAGE_TOPIC = "image_topic"

# TODO (derekahmed) We should figure out how this value was derived
# and make this a constant accordingly.
@@ -40,6 +49,7 @@ def __init__(self):
ASR_REQ_SEGMENT_SECONDS_DURATION,
IS_SENTENCE_TOKENIZE,
DEBUG_MODE,
IMAGE_TOPIC,
]
set_parameters = self.declare_parameters(
namespace="",
@@ -75,6 +85,9 @@ def __init__(self):
self._feedback_topic = (
self.get_parameter(FEEDBACK_TOPIC).get_parameter_value().string_value
)
self._image_topic = (
self.get_parameter(IMAGE_TOPIC).get_parameter_value().string_value
)

self.log.info(
f"Audio topic: "
@@ -118,6 +131,18 @@ def __init__(self):
SystemTextResponse, self._feedback_topic, 1
)

# Single slot for the latest POV frame, stored as a base64-encoded
# JPEG string (empty until the first image arrives).
self.pov_frame: str = ""

self.log.info("Creating subscription to image topic")
# Initialize ROS hooks
self.subscription = self.create_subscription(
Image,
self._image_topic,
self.process_image_callback,
1,
)

self.audio_stream = []
self.t = threading.Thread()
self.prev_timestamp = None
@@ -140,6 +165,17 @@ def __init__(self):
def listener_callback(self, msg):
self.message_queue.put(msg)

def process_image_callback(self, image: Image):
# The incoming sensor_msgs/Image is expected to be BGR8-encoded.
img0 = BRIDGE.imgmsg_to_cv2(image, desired_encoding="bgr8")

# Convert BGR to RGB and create a PIL image instance.
img_rgb = PIL.Image.fromarray(img0[:, :, ::-1], mode="RGB")
# Downscale 4x per side to keep the base64 payload small.
img_rgb = img_rgb.resize(np.divide(img_rgb.size, 4).astype(int))
jpg_container = io.BytesIO()
img_rgb.save(jpg_container, format="JPEG")
self.pov_frame = base64.b64encode(jpg_container.getvalue()).decode("utf-8")

def process_message_queue(self):
while True:
msg = self.message_queue.get()
@@ -220,22 +256,46 @@ def asr_server_request_thread(self, audio_data, num_channels, sample_rate):
self._publish_text(response_text)

def _publish_text(self, text: str):
self.log.info("Utterance was: " + f'"{text}"')

# "angel" is a substring of "angela", so this one check covers every
# keyword variant, with or without a trailing comma.
if "angel" not in text.lower():
# If Angel keyword is not found, don't publish the utterance
return

self.log.info("Publish thinking feedback")
self.publish_feedback_response()

published_msg = DialogueUtterance()
published_msg.header.frame_id = "ASR"
published_msg.header.stamp = self.get_clock().now().to_msg()
published_msg.utterance_text = text

# Find the first keyword occurrence and strip everything up to and
# including it. Longer, comma-suffixed variants come first so a
# trailing comma is removed together with the keyword.
result_text = text
keywords = ["angela,", "angela", "angel,", "angel"]
for word in keywords:
index = text.lower().find(word)
if index != -1:
# Remove the keyword and everything before it
result_text = text[index + len(word):].lstrip()
break

published_msg.utterance_text = result_text

if self.pov_frame is None or len(self.pov_frame) <= 1:
published_msg.pov_frame = ""
self.log.info("No pov frame available")
else:
published_msg.pov_frame = self.pov_frame
self.log.info("Adding pov frame to utterance...")
colored_utterance = colored(published_msg.utterance_text, "light_blue")
self.log.info("Publishing message: " + f'"{colored_utterance}"')

if (
"angela" in text.lower()
or "angel" in text.lower()
or "angela," in text.lower()
or "angel," in text.lower()
):
self.log.info("Publish thinking feedback")
self.publish_feedback_response()

self._publisher.publish(published_msg)

def publish_feedback_response(self):
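The wake-word handling above is easy to exercise in isolation. A standalone reproduction of the corrected stripping loop (the helper name is illustrative and not part of the node):

def strip_wake_word(text: str) -> str:
    """Drop everything up to and including the first Angel keyword."""
    # Longer, comma-suffixed variants first so the comma goes too.
    for word in ["angela,", "angela", "angel,", "angel"]:
        index = text.lower().find(word)
        if index != -1:
            return text[index + len(word):].lstrip()
    return text

assert strip_wake_word("Hey Angela, what's next?") == "what's next?"
assert strip_wake_word("angel how do I burp the seal") == "how do I burp the seal"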
@@ -17,6 +17,7 @@ def _copy_dialogue_utterance(
msg.header.stamp = copy_time

msg.utterance_text = src_msg.utterance_text
msg.pov_frame = src_msg.pov_frame

# Copy all optional fields below.

@@ -76,7 +76,7 @@ def _labels_list_str(labels):
example_separator="\n",
)
openai_llm = ChatOpenAI(
model_name="gpt-3.5-turbo",
model_name="gpt-4o",
openai_api_key=self.openai_api_key,
temperature=0.0,
max_tokens=1,
@@ -78,7 +78,7 @@ def _labels_list_str(labels):

# Please refer to https://github.com/hwchase17/langchain/blob/master/langchain/llms/openai.py
openai_llm = ChatOpenAI(
model_name="gpt-3.5-turbo",
model_name="gpt-4o",
openai_api_key=self.openai_api_key,
temperature=0.0,
# Only 2 tokens needed for classification (tokens are delimited by use of '_', i.e.
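Both modules above cap max_tokens so the model can only emit a bare label; the swap to gpt-4o keeps that contract. A rough sketch of the shared pattern under assumed parameters (the prompt chain these modules actually build is not shown in the hunks, and the key placeholder is hypothetical):

from langchain.chat_models import ChatOpenAI  # as imported in the modules above

# Sketch of the classification pattern: temperature 0 and a tiny
# max_tokens budget force the model to answer with just a label.
openai_llm = ChatOpenAI(
    model_name="gpt-4o",
    openai_api_key="sk-...",  # placeholder; the nodes read this from config
    temperature=0.0,
    max_tokens=1,
)
reply = openai_llm.invoke(
    "Classify the utterance as one of: question, command, chat.\n"
    "Utterance: 'what's the next step?'\n"
    "Label:"
)
print(reply.content)  # e.g. "question"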