fix: Enhance screenshot functionality; add crosshair drawing and save screenshot to file
This commit is contained in:
parent
d7c4f9b0cb
commit
b583094e20
@ -3,13 +3,29 @@ import threading
|
||||
import time, io, base64
|
||||
import sys
|
||||
from objects.inputs import MouseInput, KeyboardInput, ButtonType
|
||||
from PIL import ImageGrab # type: ignore
|
||||
from PIL import ImageGrab, ImageDraw # type: ignore
|
||||
|
||||
def take_screenshot(cross_position: list[tuple[int, int]] | None = None) -> bytes:
    """Take a screenshot of the current screen and return it as PNG bytes.

    Args:
        cross_position: Optional list of ``(x, y)`` pixel coordinates. A red
            crosshair is drawn on the captured image at each coordinate.
            ``None`` or an empty list leaves the image unmarked.

    Returns:
        The PNG-encoded screenshot, including any crosshairs that were drawn.

    Side effects:
        The same image is also written to ``screenshot.png``. The path is
        relative, so it resolves against the process's working directory.
    """
    screenshot = ImageGrab.grab()

    # Optionally draw a crosshair at each requested position.
    if cross_position:
        # One drawing context and one set of style constants serve every
        # crosshair; none of them depend on the individual position.
        draw = ImageDraw.Draw(screenshot)
        size = 20  # half-length of each arm, in pixels
        color = (255, 0, 0)
        width = 2
        for x, y in cross_position:
            # horizontal arm
            draw.line((x - size, y, x + size, y), fill=color, width=width)
            # vertical arm
            draw.line((x, y - size, x, y + size), fill=color, width=width)

    # Encode in memory for the caller...
    buf = io.BytesIO()
    screenshot.save(buf, format='PNG')
    # ...and keep a copy on disk as well.
    screenshot.save("screenshot.png", format='PNG')
    return buf.getvalue()
|
||||
|
||||
def screenshot_to_base64(screenshot: bytes) -> str:
|
||||
|
@ -42,6 +42,7 @@ class AIProcessor:
|
||||
def process(self, prompt: str, img_data: str | bytes | None = None) -> list[str | dict]:
|
||||
outputs = [] # type: list[str | dict]
|
||||
reexec = True
|
||||
click_positions = [] # used for screenshot crosshair position
|
||||
nextsteps = ""
|
||||
try:
|
||||
self.session.messages.append(
|
||||
@ -69,6 +70,14 @@ class AIProcessor:
|
||||
except:
|
||||
nextsteps = str(tc.function.arguments)
|
||||
print('ERROR NEXT STEPS IS STR, ', nextsteps)
|
||||
if tc.function.name == "click_button":
|
||||
# extract click position for screenshot crosshair
|
||||
click_positions.extend(tuple( # button_type, x, y
|
||||
map(int,(tc.function.arguments.get("x", 0),
|
||||
tc.function.arguments.get("y", 0)
|
||||
)
|
||||
)
|
||||
))
|
||||
r = ai.compute._execute(
|
||||
name=tc.function.name,
|
||||
args=json.loads(tc.function.arguments),
|
||||
@ -96,15 +105,17 @@ class AIProcessor:
|
||||
if reexec:
|
||||
self.session.messages.append(
|
||||
aic.Message(
|
||||
role="user",
|
||||
content="Tool Output: Next Steps: " + nextsteps,
|
||||
role="assistant",
|
||||
content=str(tool_calls),
|
||||
)
|
||||
)
|
||||
|
||||
img = ai.compute.screenshot_to_base64(
|
||||
ai.compute.take_screenshot()
|
||||
ai.compute.take_screenshot(cross_position=click_positions)
|
||||
)
|
||||
|
||||
|
||||
|
||||
outputs.extend( self.process(nextsteps, img) )
|
||||
return [
|
||||
{
|
||||
|
@ -9,7 +9,7 @@ You are CopeAI Windows Agent. You are currently controlling a Windows 11 machine
|
||||
You are capable to see the screen, click buttons, type text, and interact with the system. \
|
||||
You will use the functions provided. The resolution of the machine is 1920x1080. \
|
||||
Your text response must indicate what you are doing. If the place where you clicked seems incorrect, \
|
||||
you will use everything you can to find the position of the location of the goal and click again."""
|
||||
you will use everything you can to find the position of the location of the goal and click again. You will see a red cross on where you previously clicked."""
|
||||
|
||||
FUNCTIONS = [
|
||||
{
|
||||
|
Loading…
Reference in New Issue
Block a user