fix: Enhance screenshot functionality; add crosshair drawing and save screenshot to file
This commit is contained in:
		@@ -3,13 +3,29 @@ import threading
 | 
			
		||||
import time, io, base64
 | 
			
		||||
import sys
 | 
			
		||||
from objects.inputs import MouseInput, KeyboardInput, ButtonType
 | 
			
		||||
from PIL import ImageGrab  # type: ignore
 | 
			
		||||
from PIL import ImageGrab, ImageDraw  # type: ignore
 | 
			
		||||
 | 
			
		||||
def take_screenshot() -> bytes:
 | 
			
		||||
def take_screenshot(cross_position: list[tuple[int, int]] | None = None) -> bytes:
    """Grab the current screen and return it as PNG-encoded bytes.

    Args:
        cross_position: Optional points at which to draw a red crosshair,
            given as ``(x, y)`` pairs in image coordinates. A flat
            ``[x, y, x, y, ...]`` sequence is tolerated as well, because a
            list built with ``list.extend((x, y))`` arrives in that shape;
            a trailing unpaired coordinate is ignored.

    Returns:
        The PNG bytes of the (possibly annotated) screenshot.

    Side effects:
        Writes the same PNG to ``screenshot.png`` in the current working
        directory, overwriting any previous file.
    """
    screenshot = ImageGrab.grab()

    if cross_position:
        # One drawing context and one set of style constants serve every
        # crosshair; none of them depend on the point being drawn.
        draw = ImageDraw.Draw(screenshot)
        size = 20  # half-length of each arm
        color = (255, 0, 0)
        width = 2
        for x, y in _iter_cross_points(cross_position):
            # horizontal arm
            draw.line((x - size, y, x + size, y), fill=color, width=width)
            # vertical arm
            draw.line((x, y - size, x, y + size), fill=color, width=width)

    # Encode once and reuse the bytes for both the return value and the file.
    buf = io.BytesIO()
    screenshot.save(buf, format='PNG')
    png_bytes = buf.getvalue()
    with open("screenshot.png", "wb") as fh:
        fh.write(png_bytes)
    return png_bytes


def _iter_cross_points(cross_position):
    """Yield ``(x, y)`` pairs from a list of pairs or a flat coordinate list."""
    pending = []
    for item in cross_position:
        if isinstance(item, (int, float)):
            # Bare number: part of a flat x, y, x, y, ... sequence.
            pending.append(item)
            if len(pending) == 2:
                yield pending[0], pending[1]
                pending = []
        else:
            x, y = item
            yield x, y
 | 
			
		||||
 | 
			
		||||
def screenshot_to_base64(screenshot: bytes) -> str:
 | 
			
		||||
 
 | 
			
		||||
@@ -42,6 +42,7 @@ class AIProcessor:
 | 
			
		||||
    def process(self, prompt: str, img_data: str | bytes | None = None) -> list[str | dict]:
 | 
			
		||||
        outputs = []  # type: list[str | dict]
 | 
			
		||||
        reexec = True
 | 
			
		||||
        click_positions = []  # used for screenshot crosshair position
 | 
			
		||||
        nextsteps = ""
 | 
			
		||||
        try:
 | 
			
		||||
            self.session.messages.append(
 | 
			
		||||
@@ -69,6 +70,14 @@ class AIProcessor:
 | 
			
		||||
                        except:
 | 
			
		||||
                            nextsteps = str(tc.function.arguments)
 | 
			
		||||
                            print('ERROR NEXT STEPS IS STR, ', nextsteps)
 | 
			
		||||
                    if tc.function.name == "click_button":
 | 
			
		||||
                        # extract click position for screenshot crosshair
 | 
			
		||||
                        click_positions.extend(tuple( # button_type, x, y
 | 
			
		||||
                            map(int,(tc.function.arguments.get("x", 0),
 | 
			
		||||
                                    tc.function.arguments.get("y", 0)
 | 
			
		||||
                                )
 | 
			
		||||
                            )
 | 
			
		||||
                        ))
 | 
			
		||||
                    r = ai.compute._execute(
 | 
			
		||||
                        name=tc.function.name,
 | 
			
		||||
                        args=json.loads(tc.function.arguments),
 | 
			
		||||
@@ -96,15 +105,17 @@ class AIProcessor:
 | 
			
		||||
                if reexec:
 | 
			
		||||
                    self.session.messages.append(
 | 
			
		||||
                        aic.Message(
 | 
			
		||||
                            role="user",
 | 
			
		||||
                            content="Tool Output: Next Steps: " + nextsteps,
 | 
			
		||||
                            role="assistant",
 | 
			
		||||
                            content=str(tool_calls),
 | 
			
		||||
                        )
 | 
			
		||||
                    )
 | 
			
		||||
 | 
			
		||||
                    img = ai.compute.screenshot_to_base64(
 | 
			
		||||
                        ai.compute.take_screenshot()
 | 
			
		||||
                        ai.compute.take_screenshot(cross_position=click_positions)
 | 
			
		||||
                    )
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
                    outputs.extend( self.process(nextsteps, img) )
 | 
			
		||||
                return [
 | 
			
		||||
                    {
 | 
			
		||||
 
 | 
			
		||||
@@ -9,7 +9,7 @@ You are CopeAI Windows Agent. You are currently controlling a Windows 11 machine
 | 
			
		||||
You are capable to see the screen, click buttons, type text, and interact with the system. \
 | 
			
		||||
You will use the functions provided. The resolution of the machine is 1920x1080. \
 | 
			
		||||
Your text response must indicate what you are doing. If the place where you clicked seems incorrect, \
 | 
			
		||||
you will use everything you can to find the position of the location of the goal and click again."""
 | 
			
		||||
you will use everything you can to find the position of the location of the goal and click again. You will see a red cross on where you previously clicked."""
 | 
			
		||||
 | 
			
		||||
FUNCTIONS = [
 | 
			
		||||
    {
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user