fix: Enhance screenshot functionality; add crosshair drawing and save screenshot to file
This commit is contained in:
parent
d7c4f9b0cb
commit
b583094e20
@ -3,13 +3,29 @@ import threading
|
|||||||
import time, io, base64
|
import time, io, base64
|
||||||
import sys
|
import sys
|
||||||
from objects.inputs import MouseInput, KeyboardInput, ButtonType
|
from objects.inputs import MouseInput, KeyboardInput, ButtonType
|
||||||
from PIL import ImageGrab # type: ignore
|
from PIL import ImageGrab, ImageDraw # type: ignore
|
||||||
|
|
||||||
def take_screenshot(cross_position: list[tuple[int, int]] | None = None) -> bytes:
    """Take a screenshot of the current screen and return it as PNG bytes.

    Args:
        cross_position: Optional list of ``(x, y)`` pixel coordinates. A red
            crosshair is drawn on the screenshot at each coordinate. ``None``
            or an empty list leaves the screenshot unmarked.

    Returns:
        The PNG-encoded screenshot, including any crosshairs drawn on it.

    Side effects:
        The same PNG data is also written to ``screenshot.png`` in the current
        working directory, replacing any existing file of that name.
    """
    screenshot = ImageGrab.grab()

    # Optionally draw a crosshair at each requested position.
    if cross_position:
        # One drawing context and one set of style constants serve every
        # crosshair; there is no need to rebuild them per position.
        draw = ImageDraw.Draw(screenshot)
        size = 20  # half-length of each arm
        color = (255, 0, 0)
        width = 2
        for x, y in cross_position:
            # horizontal arm
            draw.line((x - size, y, x + size, y), fill=color, width=width)
            # vertical arm
            draw.line((x, y - size, x, y + size), fill=color, width=width)

    # Encode to PNG once and reuse the bytes for both the return value and
    # the on-disk copy.
    buf = io.BytesIO()
    screenshot.save(buf, format='PNG')
    png_bytes = buf.getvalue()

    # Keep a copy on disk as well.
    with open("screenshot.png", "wb") as fh:
        fh.write(png_bytes)

    return png_bytes
|
||||||
|
|
||||||
def screenshot_to_base64(screenshot: bytes) -> str:
|
def screenshot_to_base64(screenshot: bytes) -> str:
|
||||||
|
@ -42,6 +42,7 @@ class AIProcessor:
|
|||||||
def process(self, prompt: str, img_data: str | bytes | None = None) -> list[str | dict]:
|
def process(self, prompt: str, img_data: str | bytes | None = None) -> list[str | dict]:
|
||||||
outputs = [] # type: list[str | dict]
|
outputs = [] # type: list[str | dict]
|
||||||
reexec = True
|
reexec = True
|
||||||
|
click_positions = [] # used for screenshot crosshair position
|
||||||
nextsteps = ""
|
nextsteps = ""
|
||||||
try:
|
try:
|
||||||
self.session.messages.append(
|
self.session.messages.append(
|
||||||
@ -69,6 +70,14 @@ class AIProcessor:
|
|||||||
except:
|
except:
|
||||||
nextsteps = str(tc.function.arguments)
|
nextsteps = str(tc.function.arguments)
|
||||||
print('ERROR NEXT STEPS IS STR, ', nextsteps)
|
print('ERROR NEXT STEPS IS STR, ', nextsteps)
|
||||||
|
if tc.function.name == "click_button":
|
||||||
|
# extract click position for screenshot crosshair
|
||||||
|
click_positions.extend(tuple( # button_type, x, y
|
||||||
|
map(int,(tc.function.arguments.get("x", 0),
|
||||||
|
tc.function.arguments.get("y", 0)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
))
|
||||||
r = ai.compute._execute(
|
r = ai.compute._execute(
|
||||||
name=tc.function.name,
|
name=tc.function.name,
|
||||||
args=json.loads(tc.function.arguments),
|
args=json.loads(tc.function.arguments),
|
||||||
@ -96,15 +105,17 @@ class AIProcessor:
|
|||||||
if reexec:
|
if reexec:
|
||||||
self.session.messages.append(
|
self.session.messages.append(
|
||||||
aic.Message(
|
aic.Message(
|
||||||
role="user",
|
role="assistant",
|
||||||
content="Tool Output: Next Steps: " + nextsteps,
|
content=str(tool_calls),
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
img = ai.compute.screenshot_to_base64(
|
img = ai.compute.screenshot_to_base64(
|
||||||
ai.compute.take_screenshot()
|
ai.compute.take_screenshot(cross_position=click_positions)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
outputs.extend( self.process(nextsteps, img) )
|
outputs.extend( self.process(nextsteps, img) )
|
||||||
return [
|
return [
|
||||||
{
|
{
|
||||||
|
@ -9,7 +9,7 @@ You are CopeAI Windows Agent. You are currently controlling a Windows 11 machine
|
|||||||
You are capable to see the screen, click buttons, type text, and interact with the system. \
|
You are capable to see the screen, click buttons, type text, and interact with the system. \
|
||||||
You will use the functions provided. The resolution of the machine is 1920x1080. \
|
You will use the functions provided. The resolution of the machine is 1920x1080. \
|
||||||
Your text response must indicate what you are doing. If the place where you clicked seems incorrect, \
|
Your text response must indicate what you are doing. If the place where you clicked seems incorrect, \
|
||||||
you will use everything you can to find the position of the location of the goal and click again."""
|
you will use everything you can to find the position of the location of the goal and click again. You will see a red cross on where you previously clicked."""
|
||||||
|
|
||||||
FUNCTIONS = [
|
FUNCTIONS = [
|
||||||
{
|
{
|
||||||
|
Loading…
Reference in New Issue
Block a user