fix: Enhance screenshot functionality; add crosshair drawing and save screenshot to file

This commit is contained in:
2025-05-19 13:39:26 +02:00
parent d7c4f9b0cb
commit b583094e20
3 changed files with 33 additions and 6 deletions

View File

@@ -3,13 +3,29 @@ import threading
import time, io, base64
import sys
from objects.inputs import MouseInput, KeyboardInput, ButtonType
from PIL import ImageGrab # type: ignore
from PIL import ImageGrab, ImageDraw # type: ignore
def take_screenshot(cross_position: list[tuple[int, int]] | None = None) -> bytes:
    """Take a screenshot of the current screen and return it as PNG bytes.

    Args:
        cross_position: Optional (x, y) positions. A red crosshair is drawn
            on the captured image at each one. ``None`` or an empty list
            leaves the capture unmarked.

    Returns:
        The (possibly annotated) screenshot encoded as PNG.

    Side effects:
        The same PNG data is also written to ``screenshot.png`` in the
        current working directory.
    """
    screenshot = ImageGrab.grab()

    if cross_position:
        # One drawing context serves every crosshair, and the styling is
        # constant, so both are set up once rather than per position.
        draw = ImageDraw.Draw(screenshot)
        size = 20  # half-length of each arm
        color = (255, 0, 0)
        width = 2
        for x, y in cross_position:
            # horizontal arm
            draw.line((x - size, y, x + size, y), fill=color, width=width)
            # vertical arm
            draw.line((x, y - size, x, y + size), fill=color, width=width)

    # Encode to PNG once and reuse the bytes for both the on-disk copy and
    # the return value instead of running the encoder twice.
    buf = io.BytesIO()
    screenshot.save(buf, format='PNG')
    png_bytes = buf.getvalue()
    with open("screenshot.png", "wb") as fh:
        fh.write(png_bytes)
    return png_bytes
def screenshot_to_base64(screenshot: bytes) -> str:

View File

@@ -42,6 +42,7 @@ class AIProcessor:
def process(self, prompt: str, img_data: str | bytes | None = None) -> list[str | dict]:
outputs = [] # type: list[str | dict]
reexec = True
click_positions = [] # used for screenshot crosshair position
nextsteps = ""
try:
self.session.messages.append(
@@ -69,6 +70,14 @@ class AIProcessor:
except:
nextsteps = str(tc.function.arguments)
print('ERROR NEXT STEPS IS STR, ', nextsteps)
if tc.function.name == "click_button":
# extract click position for screenshot crosshair
click_positions.extend(tuple( # button_type, x, y
map(int,(tc.function.arguments.get("x", 0),
tc.function.arguments.get("y", 0)
)
)
))
r = ai.compute._execute(
name=tc.function.name,
args=json.loads(tc.function.arguments),
@@ -96,15 +105,17 @@ class AIProcessor:
if reexec:
self.session.messages.append(
aic.Message(
role="user",
content="Tool Output: Next Steps: " + nextsteps,
role="assistant",
content=str(tool_calls),
)
)
img = ai.compute.screenshot_to_base64(
ai.compute.take_screenshot()
ai.compute.take_screenshot(cross_position=click_positions)
)
outputs.extend( self.process(nextsteps, img) )
return [
{