From b583094e205761c1af4695652393a8030f999691 Mon Sep 17 00:00:00 2001 From: Showdown76py Date: Mon, 19 May 2025 13:39:26 +0200 Subject: [PATCH] fix: Enhance screenshot functionality; add crosshair drawing and save screenshot to file --- ai/compute.py | 20 ++++++++++++++++++-- ai/processor.py | 17 ++++++++++++++--- objects/aic.py | 2 +- 3 files changed, 33 insertions(+), 6 deletions(-) diff --git a/ai/compute.py b/ai/compute.py index 808389c..b9fe727 100644 --- a/ai/compute.py +++ b/ai/compute.py @@ -3,13 +3,29 @@ import threading import time, io, base64 import sys from objects.inputs import MouseInput, KeyboardInput, ButtonType -from PIL import ImageGrab # type: ignore +from PIL import ImageGrab, ImageDraw # type: ignore -def take_screenshot() -> bytes: +def take_screenshot(cross_position: list[tuple[int, int]] | None = None) -> bytes: """Take a screenshot of the current screen and return it as bytes.""" screenshot = ImageGrab.grab() buf = io.BytesIO() + + # Optionally draw a red crosshair at every (x, y) position given + if cross_position: + draw = ImageDraw.Draw(screenshot) + size = 20 # half‐length of each arm + color = (255, 0, 0) + width = 2 + for pos in cross_position: + x, y = pos + # horizontal line + draw.line((x - size, y, x + size, y), fill=color, width=width) + # vertical line + draw.line((x, y - size, x, y + size), fill=color, width=width) + screenshot.save(buf, format='PNG') + # also keep a copy on disk (screenshot.png in the working directory) + screenshot.save("screenshot.png", format='PNG') return buf.getvalue() def screenshot_to_base64(screenshot: bytes) -> str: diff --git a/ai/processor.py b/ai/processor.py index 7da90fc..c454063 100644 --- a/ai/processor.py +++ b/ai/processor.py @@ -42,6 +42,7 @@ class AIProcessor: def process(self, prompt: str, img_data: str | bytes | None = None) -> list[str | dict]: outputs = [] # type: list[str | dict] reexec = True + click_positions = [] # used for screenshot crosshair position nextsteps = "" try: self.session.messages.append( @@ -69,6 +70,14 @@ class AIProcessor: 
except: nextsteps = str(tc.function.arguments) print('ERROR NEXT STEPS IS STR, ', nextsteps) + if tc.function.name == "click_button": + # extract click position for screenshot crosshair; arguments is a + # JSON string, and each click must be stored as one (x, y) tuple + click_args = json.loads(tc.function.arguments) + click_positions.append(( + int(click_args.get("x", 0)), + int(click_args.get("y", 0)), + )) r = ai.compute._execute( name=tc.function.name, args=json.loads(tc.function.arguments), @@ -96,15 +105,17 @@ class AIProcessor: if reexec: self.session.messages.append( aic.Message( - role="user", - content="Tool Output: Next Steps: " + nextsteps, + role="assistant", + content=str(tool_calls), ) ) img = ai.compute.screenshot_to_base64( - ai.compute.take_screenshot() + ai.compute.take_screenshot(cross_position=click_positions) ) + + outputs.extend( self.process(nextsteps, img) ) return [ { diff --git a/objects/aic.py b/objects/aic.py index a1019bd..9fcd03f 100644 --- a/objects/aic.py +++ b/objects/aic.py @@ -9,7 +9,7 @@ You are CopeAI Windows Agent. You are currently controlling a Windows 11 machine You are capable to see the screen, click buttons, type text, and interact with the system. \ You will use the functions provided. The resolution of the machine is 1920x1080. \ Your text response must indicate what you are doing. If the place where you clicked seems incorrect, \ -you will use everything you can to find the position of the location of the goal and click again.""" +you will use everything you can to find the position of the location of the goal and click again. You will see a red cross on where you previously clicked.""" FUNCTIONS = [ {