From 7ada7ee407d9ccd4fa9af4567193b21e8bd36aa4 Mon Sep 17 00:00:00 2001 From: Hoang Tran Date: Thu, 28 Nov 2024 13:51:59 +0000 Subject: [PATCH 1/5] add mouse position to env obs --- browsergym/core/src/browsergym/core/env.py | 15 +++++---------- .../core/src/browsergym/core/observation.py | 16 ++++++++++++++++ 2 files changed, 21 insertions(+), 10 deletions(-) diff --git a/browsergym/core/src/browsergym/core/env.py b/browsergym/core/src/browsergym/core/env.py index 30b565ba9..545b10726 100644 --- a/browsergym/core/src/browsergym/core/env.py +++ b/browsergym/core/src/browsergym/core/env.py @@ -15,16 +15,10 @@ from .action.highlevel import HighLevelActionSet from .chat import Chat from .constants import BROWSERGYM_ID_ATTRIBUTE, EXTRACT_OBS_MAX_TRIES -from .observation import ( - MarkingError, - _post_extract, - _pre_extract, - extract_dom_extra_properties, - extract_dom_snapshot, - extract_focused_element_bid, - extract_merged_axtree, - extract_screenshot, -) +from .observation import (MarkingError, _post_extract, _pre_extract, + extract_dom_extra_properties, extract_dom_snapshot, + extract_focused_element_bid, extract_merged_axtree, + extract_mouse_position, extract_screenshot) from .spaces import AnyBox, AnyDict, Float, Unicode from .task import AbstractBrowserTask @@ -581,6 +575,7 @@ def _get_obs(self): "last_action": self.last_action, "last_action_error": self.last_action_error, "elapsed_time": np.asarray([time.time() - self.start_time]), + "mouse_position": extract_mouse_position(self.page), } return obs diff --git a/browsergym/core/src/browsergym/core/observation.py b/browsergym/core/src/browsergym/core/observation.py index f13526603..8625c08b3 100644 --- a/browsergym/core/src/browsergym/core/observation.py +++ b/browsergym/core/src/browsergym/core/observation.py @@ -146,6 +146,22 @@ def extract_screenshot(page: playwright.sync_api.Page): return img +def extract_mouse_position(page: playwright.sync_api.Page): + """ + Extracts the mouse location on a Playwright page using a hacky JS code. + + Args: + page: the playwright page of which to extract the mouse location. + + Returns: + An array of the x and y coordinates of the mouse location. + """ + page.evaluate("document.addEventListener('mousemove', event => {window.pageX = event.clientX; window.pageY = event.clientY})") + position = page.evaluate("""() => { + return [window.pageX, window.pageY]; +}""") + return position + # we could handle more data items here if needed __BID_EXPR = r"([a-zA-Z0-9]+)" From ad79b7144a41d92637a5d079127e2e3a11b5961f Mon Sep 17 00:00:00 2001 From: Hoang Tran Date: Thu, 28 Nov 2024 13:58:22 +0000 Subject: [PATCH 2/5] move init to reset --- browsergym/core/src/browsergym/core/env.py | 2 +- browsergym/core/src/browsergym/core/observation.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/browsergym/core/src/browsergym/core/env.py b/browsergym/core/src/browsergym/core/env.py index 545b10726..a9e986bf8 100644 --- a/browsergym/core/src/browsergym/core/env.py +++ b/browsergym/core/src/browsergym/core/env.py @@ -265,7 +265,7 @@ def override_property(task, env, property): window.addEventListener("focusin", () => {window.browsergym_page_activated();}, {capture: true}); window.addEventListener("load", () => {window.browsergym_page_activated();}, {capture: true}); window.addEventListener("pageshow", () => {window.browsergym_page_activated();}, {capture: true}); -window.addEventListener("mousemove", () => {window.browsergym_page_activated();}, {capture: true}); +window.addEventListener("mousemove", (event) => {window.browsergym_page_activated(); window.pageX = event.clientX; window.pageY = event.clientY;}, {capture: true}); window.addEventListener("mouseup", () => {window.browsergym_page_activated();}, {capture: true}); window.addEventListener("mousedown", () => {window.browsergym_page_activated();}, {capture: true}); window.addEventListener("wheel", () => {window.browsergym_page_activated();}, {capture: true}); diff --git a/browsergym/core/src/browsergym/core/observation.py b/browsergym/core/src/browsergym/core/observation.py index 8625c08b3..f08f806bc 100644 --- a/browsergym/core/src/browsergym/core/observation.py +++ b/browsergym/core/src/browsergym/core/observation.py @@ -156,7 +156,6 @@ def extract_mouse_position(page: playwright.sync_api.Page): Returns: An array of the x and y coordinates of the mouse location. """ - page.evaluate("document.addEventListener('mousemove', event => {window.pageX = event.clientX; window.pageY = event.clientY})") position = page.evaluate("""() => { return [window.pageX, window.pageY]; }""") From a7014981c1df8a21939982caba7f64493761e25c Mon Sep 17 00:00:00 2001 From: Hoang Tran Date: Thu, 28 Nov 2024 15:06:54 +0000 Subject: [PATCH 3/5] fix tests --- browsergym/core/src/browsergym/core/env.py | 3 +++ tests/core/test_actions_highlevel.py | 4 +++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/browsergym/core/src/browsergym/core/env.py b/browsergym/core/src/browsergym/core/env.py index a9e986bf8..3481b486d 100644 --- a/browsergym/core/src/browsergym/core/env.py +++ b/browsergym/core/src/browsergym/core/env.py @@ -151,6 +151,9 @@ def __init__( shape=(-1, -1, 3), dtype=np.uint8, ), # swapped axes (height, width, RGB) + "mouse_position": gym.spaces.Sequence( + Float() + ), "dom_object": AnyDict(), "axtree_object": AnyDict(), "extra_element_properties": AnyDict(), diff --git a/tests/core/test_actions_highlevel.py b/tests/core/test_actions_highlevel.py index a3a4f56c6..36ee22d9d 100644 --- a/tests/core/test_actions_highlevel.py +++ b/tests/core/test_actions_highlevel.py @@ -12,7 +12,8 @@ # register openended gym environments import browsergym.core from browsergym.core.action.highlevel import HighLevelActionSet -from browsergym.core.action.parsers import NamedArgument, highlevel_action_parser +from browsergym.core.action.parsers import (NamedArgument, + highlevel_action_parser) from browsergym.core.constants import BROWSERGYM_ID_ATTRIBUTE as BID_ATTR from browsergym.utils.obs import flatten_dom_to_str @@ -1141,6 +1142,7 @@ def get_checkbox_elem(obs): obs, reward, term, trunc, info = env.step(action) checkbox = get_checkbox_elem(obs) + assert obs['mouse_position'] == [x, y] # box not checked assert not obs["last_action_error"] From e9694a1ce19688bfaf7370ab7e7a65b41bed6b9f Mon Sep 17 00:00:00 2001 From: Hoang Tran Date: Fri, 6 Dec 2024 09:57:14 +0000 Subject: [PATCH 4/5] use tuple type --- browsergym/core/src/browsergym/core/env.py | 4 ++-- browsergym/core/src/browsergym/core/observation.py | 2 +- tests/core/test_actions_highlevel.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/browsergym/core/src/browsergym/core/env.py b/browsergym/core/src/browsergym/core/env.py index 3481b486d..a46bc0996 100644 --- a/browsergym/core/src/browsergym/core/env.py +++ b/browsergym/core/src/browsergym/core/env.py @@ -151,8 +151,8 @@ def __init__( shape=(-1, -1, 3), dtype=np.uint8, ), # swapped axes (height, width, RGB) - "mouse_position": gym.spaces.Sequence( - Float() + "mouse_position": gym.spaces.Tuple( + (Float(), Float()) ), "dom_object": AnyDict(), "axtree_object": AnyDict(), diff --git a/browsergym/core/src/browsergym/core/observation.py b/browsergym/core/src/browsergym/core/observation.py index f08f806bc..acdd6650a 100644 --- a/browsergym/core/src/browsergym/core/observation.py +++ b/browsergym/core/src/browsergym/core/observation.py @@ -159,7 +159,7 @@ def extract_mouse_position(page: playwright.sync_api.Page): position = page.evaluate("""() => { return [window.pageX, window.pageY]; }""") - return position + return (position[0], position[1]) # we could handle more data items here if needed diff --git a/tests/core/test_actions_highlevel.py b/tests/core/test_actions_highlevel.py index 36ee22d9d..ac8c72805 100644 --- a/tests/core/test_actions_highlevel.py +++ b/tests/core/test_actions_highlevel.py @@ -1142,7 +1142,7 @@ def get_checkbox_elem(obs): obs, reward, term, trunc, info = env.step(action) checkbox = get_checkbox_elem(obs) - assert obs['mouse_position'] == [x, y] + assert obs['mouse_position'] == (x, y) # box not checked assert not obs["last_action_error"] From 4202aab6a2b5a0d42f9c126452a6b6fb3231bb55 Mon Sep 17 00:00:00 2001 From: Hoang Tran Date: Sat, 7 Dec 2024 09:30:19 +0000 Subject: [PATCH 5/5] track last mousemove with iframe info --- browsergym/core/src/browsergym/core/env.py | 34 ++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) diff --git a/browsergym/core/src/browsergym/core/env.py b/browsergym/core/src/browsergym/core/env.py index a46bc0996..4ca5c4462 100644 --- a/browsergym/core/src/browsergym/core/env.py +++ b/browsergym/core/src/browsergym/core/env.py @@ -255,12 +255,17 @@ def override_property(task, env, property): # set default timeout self.context.set_default_timeout(timeout) - # hack: keep track of the active page with a javascript callback + # hack: keep track of the active page and mouse position with javascript callbacks # there is no concept of active page in playwright # https://github.com/microsoft/playwright/issues/2603 self.context.expose_binding( "browsergym_page_activated", lambda source: self._activate_page_from_js(source["page"]) ) + self.context.expose_binding( + "browsergym_mouse_moved", lambda source: self._update_mouse_position_from_js(source) + ) + # Initialize mouse position tracking + self.last_mouse_position = None self.context.add_init_script( r""" window.browsergym_page_activated(); @@ -268,7 +273,13 @@ def override_property(task, env, property): window.addEventListener("focusin", () => {window.browsergym_page_activated();}, {capture: true}); window.addEventListener("load", () => {window.browsergym_page_activated();}, {capture: true}); window.addEventListener("pageshow", () => {window.browsergym_page_activated();}, {capture: true}); -window.addEventListener("mousemove", (event) => {window.browsergym_page_activated(); window.pageX = event.clientX; window.pageY = event.clientY;}, {capture: true}); +window.addEventListener("mousemove", (event) => { + window.browsergym_page_activated(); + window.browsergym_mouse_moved({ + x: event.clientX, + y: event.clientY + }); +}, {capture: true}); window.addEventListener("mouseup", () => {window.browsergym_page_activated();}, {capture: true}); window.addEventListener("mousedown", () => {window.browsergym_page_activated();}, {capture: true}); window.addEventListener("wheel", () => {window.browsergym_page_activated();}, {capture: true}); @@ -482,6 +493,25 @@ def _wait_dom_loaded(self): except playwright.sync_api.Error: pass + def _update_mouse_position_from_js(self, source): + page = source["page"] + x = source["x"] + y = source["y"] + logger.debug(f"_update_mouse_position_from_js called, page={str(page)}, x={x}, y={y}") + + if not page.context == self.context: + raise RuntimeError( + f"Unexpected: mouse event from a page that belongs to a different browser context ({page})." + ) + + # Store the mouse position along with the page that received the event + self.last_mouse_position = { + "page": page, + "x": x, + "y": y, + "timestamp": time.time() + } + def _activate_page_from_js(self, page: playwright.sync_api.Page): logger.debug(f"_activate_page_from_js(page) called, page={str(page)}") if not page.context == self.context: