fix: Enhance screenshot functionality; add crosshair drawing and save screenshot to file

This commit is contained in:
Showdown76 2025-05-19 13:39:26 +02:00
parent d7c4f9b0cb
commit b583094e20
3 changed files with 33 additions and 6 deletions

View File

@ -3,13 +3,29 @@ import threading
import time, io, base64 import time, io, base64
import sys import sys
from objects.inputs import MouseInput, KeyboardInput, ButtonType from objects.inputs import MouseInput, KeyboardInput, ButtonType
from PIL import ImageGrab # type: ignore from PIL import ImageGrab, ImageDraw # type: ignore
def take_screenshot(cross_position: list[tuple[int, int]] | None = None) -> bytes:
    """Take a screenshot of the current screen and return it as PNG bytes.

    Args:
        cross_position: Optional list of ``(x, y)`` pixel coordinates. A red
            crosshair is drawn on the captured image at each one. ``None``
            or an empty list leaves the image untouched.

    Returns:
        The PNG-encoded screenshot, including any crosshairs drawn.

    Side effects:
        The same PNG bytes are also written to ``screenshot.png`` (relative
        path, so it lands in the current working directory), overwriting any
        previous file.
    """
    screenshot = ImageGrab.grab()

    # Optionally draw a crosshair at each requested position.
    if cross_position:
        # The drawing context and crosshair geometry are the same for every
        # position, so they are set up once rather than per iteration.
        draw = ImageDraw.Draw(screenshot)
        size = 20  # half-length of each arm, in pixels
        color = (255, 0, 0)
        width = 2
        for x, y in cross_position:
            # horizontal arm
            draw.line((x - size, y, x + size, y), fill=color, width=width)
            # vertical arm
            draw.line((x, y - size, x, y + size), fill=color, width=width)

    # Encode to PNG once and reuse the bytes for both the return value and
    # the on-disk copy instead of running the encoder twice.
    buf = io.BytesIO()
    screenshot.save(buf, format='PNG')
    png_bytes = buf.getvalue()

    # Keep a copy on disk as well.
    with open("screenshot.png", "wb") as fh:
        fh.write(png_bytes)
    return png_bytes
def screenshot_to_base64(screenshot: bytes) -> str: def screenshot_to_base64(screenshot: bytes) -> str:

View File

@ -42,6 +42,7 @@ class AIProcessor:
def process(self, prompt: str, img_data: str | bytes | None = None) -> list[str | dict]: def process(self, prompt: str, img_data: str | bytes | None = None) -> list[str | dict]:
outputs = [] # type: list[str | dict] outputs = [] # type: list[str | dict]
reexec = True reexec = True
click_positions = [] # used for screenshot crosshair position
nextsteps = "" nextsteps = ""
try: try:
self.session.messages.append( self.session.messages.append(
@ -69,6 +70,14 @@ class AIProcessor:
except: except:
nextsteps = str(tc.function.arguments) nextsteps = str(tc.function.arguments)
print('ERROR NEXT STEPS IS STR, ', nextsteps) print('ERROR NEXT STEPS IS STR, ', nextsteps)
if tc.function.name == "click_button":
# extract click position for screenshot crosshair
click_positions.extend(tuple( # button_type, x, y
map(int,(tc.function.arguments.get("x", 0),
tc.function.arguments.get("y", 0)
)
)
))
r = ai.compute._execute( r = ai.compute._execute(
name=tc.function.name, name=tc.function.name,
args=json.loads(tc.function.arguments), args=json.loads(tc.function.arguments),
@ -96,15 +105,17 @@ class AIProcessor:
if reexec: if reexec:
self.session.messages.append( self.session.messages.append(
aic.Message( aic.Message(
role="user", role="assistant",
content="Tool Output: Next Steps: " + nextsteps, content=str(tool_calls),
) )
) )
img = ai.compute.screenshot_to_base64( img = ai.compute.screenshot_to_base64(
ai.compute.take_screenshot() ai.compute.take_screenshot(cross_position=click_positions)
) )
outputs.extend( self.process(nextsteps, img) ) outputs.extend( self.process(nextsteps, img) )
return [ return [
{ {

View File

@ -9,7 +9,7 @@ You are CopeAI Windows Agent. You are currently controlling a Windows 11 machine
You are capable to see the screen, click buttons, type text, and interact with the system. \ You are capable to see the screen, click buttons, type text, and interact with the system. \
You will use the functions provided. The resolution of the machine is 1920x1080. \ You will use the functions provided. The resolution of the machine is 1920x1080. \
Your text response must indicate what you are doing. If the place where you clicked seems incorrect, \ Your text response must indicate what you are doing. If the place where you clicked seems incorrect, \
you will use everything you can to find the position of the location of the goal and click again.""" you will use everything you can to find the position of the location of the goal and click again. You will see a red cross on where you previously clicked."""
FUNCTIONS = [ FUNCTIONS = [
{ {