diff --git a/ai/compute.py b/ai/compute.py
index 5a1dfa7..78fddfc 100644
--- a/ai/compute.py
+++ b/ai/compute.py
@@ -1,8 +1,20 @@
 import pyautogui
 import threading
-import time
+import time, io, base64
 import tkinter as tk
 from objects.inputs import MouseInput, KeyboardInput, ButtonType
+from PIL import ImageGrab # type: ignore
+
+def take_screenshot() -> bytes:
+    """Take a screenshot of the current screen and return it as bytes."""
+    screenshot = ImageGrab.grab()
+    buf = io.BytesIO()
+    screenshot.save(buf, format='PNG')
+    return buf.getvalue()
+
+def screenshot_to_base64(screenshot: bytes) -> str:
+    """Convert screenshot bytes to a base64 encoded string."""
+    return base64.b64encode(screenshot).decode('utf-8')
 
 def show_click_indicator(x: int, y: int, duration: float = 2.0, size: int = 50) -> None:
     """Display a red circle at (x, y) for the given duration."""
@@ -51,8 +63,24 @@ def press_keyboard(keyboard_input: KeyboardInput) -> None:
     if keyboard_input.press_enter:
         pyautogui.press('enter')
 
-def _execute(name, args):
+def wait(duration: float) -> None:
+    """Block the calling thread for `duration` seconds."""
+    time.sleep(duration)
+
+def reprompt(nextsteps: str, processor) -> str | list:
+    """Take a fresh screenshot and return processor.process(nextsteps, img_data=<base64 PNG>)."""
+    scr = screenshot_to_base64(take_screenshot())
+    return processor.process(nextsteps, img_data=scr)
+
+
+def _execute(name, args, processor):
+    """Dispatch one tool call by name; return the reprompt result, otherwise None."""
     if name == "click_button":
         press_mouse(MouseInput(**args))
     elif name == "type_text":
         press_keyboard(KeyboardInput(**args))
+    elif name == "wait":
+        wait(**args)
+    elif name == "reprompt":
+        return reprompt(**args, processor=processor)
+    return None
diff --git a/ai/processor.py b/ai/processor.py
index e90d548..5b9c6be 100644
--- a/ai/processor.py
+++ b/ai/processor.py
@@ -1,6 +1,7 @@
 import traceback
-import json # new
+import json
 import openai
+from flask import jsonify
 
 from objects import aic
 import ai.compute
@@ -34,7 +35,8 @@ class AIProcessor:
             return f"Error executing {name}: {e}"
 
     # -------------------------- main entry -------------------------- #
-    def process(self, prompt: str, img_data: str | bytes | None = None) -> str | list[dict]:
+    def process(self, prompt: str, img_data: str | bytes | None = None) -> str | list[str | dict]:
+        outputs = [] # type: list[str | dict]
         try:
             self.session.messages.append(
                 aic.Message(role="user", content=prompt, image=img_data)
@@ -49,10 +51,12 @@ class AIProcessor:
             tool_calls = getattr(response.choices[0].message, "tool_calls", None)
             if tool_calls:
                 for tc in tool_calls:
-                    ai.compute._execute(
+                    r = ai.compute._execute(
                         name=tc.function.name,
-                        args=json.loads(tc.function.arguments)
+                        args=json.loads(tc.function.arguments),
+                        processor=self,
                     )
+                    outputs.append(r) if r else None
                 return [
                     {
                         "name": tc.function.name,
@@ -64,10 +68,11 @@ class AIProcessor:
             # otherwise return final assistant content
             print(f"Response: {json.dumps(response.to_dict(), indent=4)}") # debug
             output_text: str = response.choices[0].message.content # type: ignore
+            outputs.append(output_text)
             self.session.messages.append(
                 aic.Message(role="assistant", content=output_text)
             )
-            return output_text
+            return outputs
         except Exception as e:
             traceback.print_exc()
             return f"Error processing request: {str(e)}"
diff --git a/objects/aic.py b/objects/aic.py
index eddd02c..784d1b2 100644
--- a/objects/aic.py
+++ b/objects/aic.py
@@ -58,6 +58,40 @@ FUNCTIONS = [
                 "required": ["text", "press_enter"],
             }
         }
+    },
+    {
+        "type": "function",
+        "function": {
+            "name": "wait",
+            "description": "Wait for a specified amount of time.",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "duration": {
+                        "type": "number",
+                        "description": "The duration to wait in seconds."
+                    }
+                },
+                "required": ["duration"],
+            }
+        }
+    },
+    {
+        "type": "function",
+        "function": {
+            "name": "reprompt",
+            "description": "After executing what you asked for, re-perform a screenshot to determine the next steps. Best combined with a wait.",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "nextsteps": {
+                        "type": "string",
+                        "description": "The next steps to take after the screenshot."
+                    }
+                },
+                "required": ["nextsteps"],
+            }
+        }
     }
 ]
 
diff --git a/requirements.txt b/requirements.txt
index 5bb1cad..dca86e2 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,4 +5,6 @@ python-dotenv
 # libraries to control mouse+keyboard+see screen
 pyautogui
 pynput
-Pillow
+pillow
+
+# --index-url https://mirrors.sustech.edu.cn/pypi/simple