Add screenshot functionality and new commands for wait and reprompt

2025-05-19 09:15:08 +02:00 · 2025-05-19 09:15:08 +02:00 · f7feb12946
commit f7feb12946
parent 66330bfc73
4 changed files with 75 additions and 8 deletions
--- a/ai/compute.py
+++ b/ai/compute.py
@ -1,8 +1,20 @@
 import pyautogui
 import threading
-import time
+import time, io, base64
 import tkinter as tk
 from objects.inputs import MouseInput, KeyboardInput, ButtonType
+from PIL import ImageGrab  # type: ignore
+
+def take_screenshot() -> bytes:
+    """Take a screenshot of the current screen and return it as PNG-encoded bytes."""
+    screenshot = ImageGrab.grab()
+    buf = io.BytesIO()
+    screenshot.save(buf, format='PNG')
+    return buf.getvalue()
+
+def screenshot_to_base64(screenshot: bytes) -> str:
+    """Convert screenshot bytes to a base64 encoded string."""
+    return base64.b64encode(screenshot).decode('utf-8')

 def show_click_indicator(x: int, y: int, duration: float = 2.0, size: int = 50) -> None:
    """Display a red circle at (x, y) for the given duration."""
@ -51,8 +63,22 @@ def press_keyboard(keyboard_input: KeyboardInput) -> None:
    if keyboard_input.press_enter:
        pyautogui.press('enter')

-def _execute(name, args):
+def wait(duration: float) -> None:
+    """Waits for the specified duration in seconds."""
+    time.sleep(duration)
+
+def reprompt(nextsteps: str, processor) -> None:
+    """Take a new screenshot, pass it with `nextsteps` to `processor.process`, and return the result."""
+    scr = screenshot_to_base64(take_screenshot())
+    return processor.process(nextsteps, img_data=scr)
+
+
+def _execute(name, args, processor):
    if name == "click_button":
        press_mouse(MouseInput(**args))
    elif name == "type_text":
        press_keyboard(KeyboardInput(**args))
+    elif name == "wait":
+        wait(**args)
+    elif name == "reprompt":
+        reprompt(**args, processor=processor)
--- a/ai/processor.py
+++ b/ai/processor.py
@ -1,6 +1,7 @@
 import traceback
-import json                                           # new
+import json
 import openai
+from flask import jsonify
 from objects import aic
 import ai.compute

@ -34,7 +35,8 @@ class AIProcessor:
            return f"Error executing {name}: {e}"

    # -------------------------- main entry -------------------------- #
-    def process(self, prompt: str, img_data: str | bytes | None = None) -> str | list[dict]:
+    def process(self, prompt: str, img_data: str | bytes | None = None) -> str | list[str | dict]:
+        outputs = []  # type: list[str | dict]
        try:
            self.session.messages.append(
                aic.Message(role="user", content=prompt, image=img_data)
@ -49,10 +51,12 @@ class AIProcessor:
            tool_calls = getattr(response.choices[0].message, "tool_calls", None)
            if tool_calls:
                for tc in tool_calls:
-                    ai.compute._execute(
+                    r = ai.compute._execute(
                        name=tc.function.name,
-                        args=json.loads(tc.function.arguments)
+                        args=json.loads(tc.function.arguments),
+                        processor=self,
                    )
+                    outputs.append(r) if r else None
                return [
                    {
                        "name": tc.function.name,
@ -64,10 +68,11 @@ class AIProcessor:
            # otherwise return final assistant content
            print(f"Response: {json.dumps(response.to_dict(), indent=4)}")  # debug
            output_text: str = response.choices[0].message.content  # type: ignore
+            outputs.append(output_text)
            self.session.messages.append(
                aic.Message(role="assistant", content=output_text)
            )
-            return output_text
+            return outputs
        except Exception as e:
            traceback.print_exc()
            return f"Error processing request: {str(e)}"
--- a/objects/aic.py
+++ b/objects/aic.py
@ -58,6 +58,40 @@ FUNCTIONS = [
                "required": ["text", "press_enter"],
            }
        }
+    },
+    {
+        "type": "function",
+        "function": {
+            "name": "wait",
+            "description": "Wait for a specified amount of time.",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "duration": {
+                        "type": "number",
+                        "description": "The duration to wait in seconds."
+                    }
+                },
+                "required": ["duration"],
+            }
+        }
+    },
+    {
+        "type": "function",
+        "function": {
+            "name": "reprompt",
+            "description": "After executing what you asked for, re-perform a screenshot to determine the next steps. Best combined with a wait.",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "nextsteps": {
+                        "type": "string",
+                        "description": "The next steps to take after the screenshot."
+                    }
+                },
+                "required": ["nextsteps"],
+            }
+        }
    }
 ]

--- a/requirements.txt
+++ b/requirements.txt
@ -5,4 +5,6 @@ python-dotenv
 # libraries to control mouse+keyboard+see screen
 pyautogui
 pynput
-Pillow
+pillow
+
+# --index-url https://mirrors.sustech.edu.cn/pypi/simple