fix: Enhance screenshot functionality; add crosshair drawing and save screenshot to file
This commit is contained in:
		@@ -3,13 +3,29 @@ import threading
 | 
				
			|||||||
import time, io, base64
 | 
					import time, io, base64
 | 
				
			||||||
import sys
 | 
					import sys
 | 
				
			||||||
from objects.inputs import MouseInput, KeyboardInput, ButtonType
 | 
					from objects.inputs import MouseInput, KeyboardInput, ButtonType
 | 
				
			||||||
from PIL import ImageGrab  # type: ignore
 | 
					from PIL import ImageGrab, ImageDraw  # type: ignore
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def take_screenshot() -> bytes:
 | 
					def take_screenshot(cross_position: list[tuple[int, int]] | None = None) -> bytes:
 | 
				
			||||||
    """Take a screenshot of the current screen and return it as bytes."""
 | 
					    """Take a screenshot of the current screen and return it as bytes."""
 | 
				
			||||||
    screenshot = ImageGrab.grab()
 | 
					    screenshot = ImageGrab.grab()
 | 
				
			||||||
    buf = io.BytesIO()
 | 
					    buf = io.BytesIO()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    # Optionally draw a red crosshair at each specified (x, y) position
 | 
				
			||||||
 | 
					    if cross_position:
 | 
				
			||||||
 | 
					        for pos in cross_position:
 | 
				
			||||||
 | 
					            x, y = pos
 | 
				
			||||||
 | 
					            draw = ImageDraw.Draw(screenshot)
 | 
				
			||||||
 | 
					            size = 20      # half‐length of each arm
 | 
				
			||||||
 | 
					            color = (255, 0, 0)
 | 
				
			||||||
 | 
					            width = 2
 | 
				
			||||||
 | 
					            # horizontal line
 | 
				
			||||||
 | 
					            draw.line((x - size, y, x + size, y), fill=color, width=width)
 | 
				
			||||||
 | 
					            # vertical line
 | 
				
			||||||
 | 
					            draw.line((x, y - size, x, y + size), fill=color, width=width)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    screenshot.save(buf, format='PNG')
 | 
					    screenshot.save(buf, format='PNG')
 | 
				
			||||||
 | 
					    # also write a copy of the image to "screenshot.png" (relative path)
 | 
				
			||||||
 | 
					    screenshot.save("screenshot.png", format='PNG')
 | 
				
			||||||
    return buf.getvalue()
 | 
					    return buf.getvalue()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def screenshot_to_base64(screenshot: bytes) -> str:
 | 
					def screenshot_to_base64(screenshot: bytes) -> str:
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -42,6 +42,7 @@ class AIProcessor:
 | 
				
			|||||||
    def process(self, prompt: str, img_data: str | bytes | None = None) -> list[str | dict]:
 | 
					    def process(self, prompt: str, img_data: str | bytes | None = None) -> list[str | dict]:
 | 
				
			||||||
        outputs = []  # type: list[str | dict]
 | 
					        outputs = []  # type: list[str | dict]
 | 
				
			||||||
        reexec = True
 | 
					        reexec = True
 | 
				
			||||||
 | 
					        click_positions = []  # used for screenshot crosshair position
 | 
				
			||||||
        nextsteps = ""
 | 
					        nextsteps = ""
 | 
				
			||||||
        try:
 | 
					        try:
 | 
				
			||||||
            self.session.messages.append(
 | 
					            self.session.messages.append(
 | 
				
			||||||
@@ -69,6 +70,14 @@ class AIProcessor:
 | 
				
			|||||||
                        except:
 | 
					                        except:
 | 
				
			||||||
                            nextsteps = str(tc.function.arguments)
 | 
					                            nextsteps = str(tc.function.arguments)
 | 
				
			||||||
                            print('ERROR NEXT STEPS IS STR, ', nextsteps)
 | 
					                            print('ERROR NEXT STEPS IS STR, ', nextsteps)
 | 
				
			||||||
 | 
					                    if tc.function.name == "click_button":
 | 
				
			||||||
 | 
					                        # extract click position for screenshot crosshair
 | 
				
			||||||
 | 
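					                        click_positions.extend(tuple( # (x, y) only -- NOTE(review): extend() flattens the pair into two ints, while take_screenshot unpacks (x, y) tuples; append() looks intended. Also, arguments is passed to json.loads below, so .get on it presumably fails on a str -- confirm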
 | 
				
			||||||
 | 
					                            map(int,(tc.function.arguments.get("x", 0),
 | 
				
			||||||
 | 
					                                    tc.function.arguments.get("y", 0)
 | 
				
			||||||
 | 
					                                )
 | 
				
			||||||
 | 
					                            )
 | 
				
			||||||
 | 
					                        ))
 | 
				
			||||||
                    r = ai.compute._execute(
 | 
					                    r = ai.compute._execute(
 | 
				
			||||||
                        name=tc.function.name,
 | 
					                        name=tc.function.name,
 | 
				
			||||||
                        args=json.loads(tc.function.arguments),
 | 
					                        args=json.loads(tc.function.arguments),
 | 
				
			||||||
@@ -96,15 +105,17 @@ class AIProcessor:
 | 
				
			|||||||
                if reexec:
 | 
					                if reexec:
 | 
				
			||||||
                    self.session.messages.append(
 | 
					                    self.session.messages.append(
 | 
				
			||||||
                        aic.Message(
 | 
					                        aic.Message(
 | 
				
			||||||
                            role="user",
 | 
					                            role="assistant",
 | 
				
			||||||
                            content="Tool Output: Next Steps: " + nextsteps,
 | 
					                            content=str(tool_calls),
 | 
				
			||||||
                        )
 | 
					                        )
 | 
				
			||||||
                    )
 | 
					                    )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
                    img = ai.compute.screenshot_to_base64(
 | 
					                    img = ai.compute.screenshot_to_base64(
 | 
				
			||||||
                        ai.compute.take_screenshot()
 | 
					                        ai.compute.take_screenshot(cross_position=click_positions)
 | 
				
			||||||
                    )
 | 
					                    )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
                    outputs.extend( self.process(nextsteps, img) )
 | 
					                    outputs.extend( self.process(nextsteps, img) )
 | 
				
			||||||
                return [
 | 
					                return [
 | 
				
			||||||
                    {
 | 
					                    {
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -9,7 +9,7 @@ You are CopeAI Windows Agent. You are currently controlling a Windows 11 machine
 | 
				
			|||||||
You are capable to see the screen, click buttons, type text, and interact with the system. \
 | 
					You are capable to see the screen, click buttons, type text, and interact with the system. \
 | 
				
			||||||
You will use the functions provided. The resolution of the machine is 1920x1080. \
 | 
					You will use the functions provided. The resolution of the machine is 1920x1080. \
 | 
				
			||||||
Your text response must indicate what you are doing. If the place where you clicked seems incorrect, \
 | 
					Your text response must indicate what you are doing. If the place where you clicked seems incorrect, \
 | 
				
			||||||
you will use everything you can to find the position of the location of the goal and click again."""
 | 
					you will use everything you can to find the position of the location of the goal and click again. You will see a red cross on where you previously clicked."""
 | 
				
			||||||
 | 
					
 | 
				
			||||||
FUNCTIONS = [
 | 
					FUNCTIONS = [
 | 
				
			||||||
    {
 | 
					    {
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user