fix: Enhance screenshot functionality; add crosshair drawing and save screenshot to file

2025-05-19 13:39:26 +02:00
parent d7c4f9b0cb
commit b583094e20
3 changed files with 33 additions and 6 deletions
--- a/ai/compute.py
+++ b/ai/compute.py
@@ -3,13 +3,29 @@ import threading
 import time, io, base64
 import sys
 from objects.inputs import MouseInput, KeyboardInput, ButtonType
-from PIL import ImageGrab  # type: ignore
+from PIL import ImageGrab, ImageDraw  # type: ignore

-def take_screenshot() -> bytes:
+def take_screenshot(cross_position: list[tuple[int, int]] | None = None) -> bytes:
    """Take a screenshot of the current screen and return it as bytes."""
    screenshot = ImageGrab.grab()
    buf = io.BytesIO()
+
+    # Optionally draw a crosshair at each of the specified positions
+    if cross_position:
+        for pos in cross_position:
+            x, y = pos
+            draw = ImageDraw.Draw(screenshot)
+            size = 20      # half‐length of each arm
+            color = (255, 0, 0)
+            width = 2
+            # horizontal line
+            draw.line((x - size, y, x + size, y), fill=color, width=width)
+            # vertical line
+            draw.line((x, y - size, x, y + size), fill=color, width=width)
+
    screenshot.save(buf, format='PNG')
+    # also write the same PNG to "screenshot.png" (relative path, i.e. the current working directory)
+    screenshot.save("screenshot.png", format='PNG')
    return buf.getvalue()

 def screenshot_to_base64(screenshot: bytes) -> str:
--- a/ai/processor.py
+++ b/ai/processor.py
@@ -42,6 +42,7 @@ class AIProcessor:
    def process(self, prompt: str, img_data: str | bytes | None = None) -> list[str | dict]:
        outputs = []  # type: list[str | dict]
        reexec = True
+        click_positions = []  # used for screenshot crosshair position
        nextsteps = ""
        try:
            self.session.messages.append(
@@ -69,6 +70,14 @@ class AIProcessor:
                        except:
                            nextsteps = str(tc.function.arguments)
                            print('ERROR NEXT STEPS IS STR, ', nextsteps)
+                    if tc.function.name == "click_button":
+                        # extract click position for screenshot crosshair
+                        click_positions.extend(tuple( # x, y only (button_type is not read); NOTE(review): extend() adds them as two separate ints
+                            map(int,(tc.function.arguments.get("x", 0),
+                                    tc.function.arguments.get("y", 0)
+                                )
+                            )
+                        ))
                    r = ai.compute._execute(
                        name=tc.function.name,
                        args=json.loads(tc.function.arguments),
@@ -96,15 +105,17 @@ class AIProcessor:
                if reexec:
                    self.session.messages.append(
                        aic.Message(
-                            role="user",
-                            content="Tool Output: Next Steps: " + nextsteps,
+                            role="assistant",
+                            content=str(tool_calls),
                        )
                    )

                    img = ai.compute.screenshot_to_base64(
-                        ai.compute.take_screenshot()
+                        ai.compute.take_screenshot(cross_position=click_positions)
                    )

+
+
                    outputs.extend( self.process(nextsteps, img) )
                return [
                    {