Add screenshot functionality and new commands for wait and reprompt
This commit is contained in:
		@@ -1,8 +1,20 @@
 | 
			
		||||
import pyautogui
 | 
			
		||||
import threading
 | 
			
		||||
import time
 | 
			
		||||
import time, io, base64
 | 
			
		||||
import tkinter as tk
 | 
			
		||||
from objects.inputs import MouseInput, KeyboardInput, ButtonType
 | 
			
		||||
from PIL import ImageGrab  # type: ignore
 | 
			
		||||
 | 
			
		||||
def take_screenshot() -> bytes:
    """Grab the current screen and return the image encoded as PNG bytes."""
    # Encode into an in-memory buffer so nothing is written to disk.
    with io.BytesIO() as png_buffer:
        ImageGrab.grab().save(png_buffer, format='PNG')
        return png_buffer.getvalue()
 | 
			
		||||
 | 
			
		||||
def screenshot_to_base64(screenshot: bytes) -> str:
    """Return the base64 text representation of the given screenshot bytes."""
    encoded = base64.b64encode(screenshot)
    # b64encode yields bytes; turn them into a str for the caller.
    return str(encoded, 'utf-8')
 | 
			
		||||
 | 
			
		||||
def show_click_indicator(x: int, y: int, duration: float = 2.0, size: int = 50) -> None:
 | 
			
		||||
    """Display a red circle at (x, y) for the given duration."""
 | 
			
		||||
@@ -51,8 +63,22 @@ def press_keyboard(keyboard_input: KeyboardInput) -> None:
 | 
			
		||||
    if keyboard_input.press_enter:
 | 
			
		||||
        pyautogui.press('enter')
 | 
			
		||||
 | 
			
		||||
def _execute(name, args):
 | 
			
		||||
def wait(duration: float) -> None:
    """Block the calling thread for ``duration`` seconds."""
    seconds = duration
    time.sleep(seconds)
    return None
 | 
			
		||||
 | 
			
		||||
def reprompt(nextsteps: str, processor) -> str | list[str | dict]:
    """Take a fresh screenshot and send it back through the processor.

    Args:
        nextsteps: Prompt text passed to ``processor.process`` as the new
            user message.
        processor: Object exposing ``process(prompt, img_data=...)``.

    Returns:
        Whatever ``processor.process`` returns. The previous ``-> None``
        annotation contradicted the ``return`` statement below.
    """
    # Capture the screen as it looks now and encode it as base64 text.
    scr = screenshot_to_base64(take_screenshot())
    return processor.process(nextsteps, img_data=scr)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def _execute(name, args, processor):
 | 
			
		||||
    if name == "click_button":
 | 
			
		||||
        press_mouse(MouseInput(**args))
 | 
			
		||||
    elif name == "type_text":
 | 
			
		||||
        press_keyboard(KeyboardInput(**args))
 | 
			
		||||
    elif name == "wait":
 | 
			
		||||
        wait(**args)
 | 
			
		||||
    elif name == "reprompt":
 | 
			
		||||
        reprompt(**args, processor=processor)
 | 
			
		||||
 
 | 
			
		||||
@@ -1,6 +1,7 @@
 | 
			
		||||
import traceback
 | 
			
		||||
import json                                           # new
 | 
			
		||||
import json
 | 
			
		||||
import openai
 | 
			
		||||
from flask import jsonify
 | 
			
		||||
from objects import aic
 | 
			
		||||
import ai.compute
 | 
			
		||||
 | 
			
		||||
@@ -34,7 +35,8 @@ class AIProcessor:
 | 
			
		||||
            return f"Error executing {name}: {e}"
 | 
			
		||||
 | 
			
		||||
    # -------------------------- main entry -------------------------- #
 | 
			
		||||
    def process(self, prompt: str, img_data: str | bytes | None = None) -> str | list[dict]:
 | 
			
		||||
    def process(self, prompt: str, img_data: str | bytes | None = None) -> str | list[str | dict]:
 | 
			
		||||
        outputs = []  # type: list[str | dict]
 | 
			
		||||
        try:
 | 
			
		||||
            self.session.messages.append(
 | 
			
		||||
                aic.Message(role="user", content=prompt, image=img_data)
 | 
			
		||||
@@ -49,10 +51,12 @@ class AIProcessor:
 | 
			
		||||
            tool_calls = getattr(response.choices[0].message, "tool_calls", None)
 | 
			
		||||
            if tool_calls:
 | 
			
		||||
                for tc in tool_calls:
 | 
			
		||||
                    ai.compute._execute(
 | 
			
		||||
                    r = ai.compute._execute(
 | 
			
		||||
                        name=tc.function.name,
 | 
			
		||||
                        args=json.loads(tc.function.arguments)
 | 
			
		||||
                        args=json.loads(tc.function.arguments),
 | 
			
		||||
                        processor=self,
 | 
			
		||||
                    )
 | 
			
		||||
                    outputs.append(r) if r else None
 | 
			
		||||
                return [
 | 
			
		||||
                    {
 | 
			
		||||
                        "name": tc.function.name,
 | 
			
		||||
@@ -64,10 +68,11 @@ class AIProcessor:
 | 
			
		||||
            # otherwise return final assistant content
 | 
			
		||||
            print(f"Response: {json.dumps(response.to_dict(), indent=4)}")  # debug
 | 
			
		||||
            output_text: str = response.choices[0].message.content  # type: ignore
 | 
			
		||||
            outputs.append(output_text)
 | 
			
		||||
            self.session.messages.append(
 | 
			
		||||
                aic.Message(role="assistant", content=output_text)
 | 
			
		||||
            )
 | 
			
		||||
            return output_text
 | 
			
		||||
            return outputs
 | 
			
		||||
        except Exception as e:
 | 
			
		||||
            traceback.print_exc()
 | 
			
		||||
            return f"Error processing request: {str(e)}"
 | 
			
		||||
 
 | 
			
		||||
@@ -58,6 +58,40 @@ FUNCTIONS = [
 | 
			
		||||
                "required": ["text", "press_enter"],
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
    },
 | 
			
		||||
    {
 | 
			
		||||
        "type": "function",
 | 
			
		||||
        "function": {
 | 
			
		||||
            "name": "wait",
 | 
			
		||||
            "description": "Wait for a specified amount of time.",
 | 
			
		||||
            "parameters": {
 | 
			
		||||
                "type": "object",
 | 
			
		||||
                "properties": {
 | 
			
		||||
                    "duration": {
 | 
			
		||||
                        "type": "number",
 | 
			
		||||
                        "description": "The duration to wait in seconds."
 | 
			
		||||
                    }
 | 
			
		||||
                },
 | 
			
		||||
                "required": ["duration"],
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
    },
 | 
			
		||||
    {
 | 
			
		||||
        "type": "function",
 | 
			
		||||
        "function": {
 | 
			
		||||
            "name": "reprompt",
 | 
			
		||||
            "description": "After executing what you asked for, re-perform a screenshot to determine the next steps. Best combined with a wait.",
 | 
			
		||||
            "parameters": {
 | 
			
		||||
                "type": "object",
 | 
			
		||||
                "properties": {
 | 
			
		||||
                    "nextsteps": {
 | 
			
		||||
                        "type": "string",
 | 
			
		||||
                        "description": "The next steps to take after the screenshot."
 | 
			
		||||
                    }
 | 
			
		||||
                },
 | 
			
		||||
                "required": ["nextsteps"],
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
]
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -5,4 +5,6 @@ python-dotenv
 | 
			
		||||
# libraries to control mouse+keyboard+see screen
 | 
			
		||||
pyautogui
 | 
			
		||||
pynput
 | 
			
		||||
Pillow
 | 
			
		||||
pillow
 | 
			
		||||
 | 
			
		||||
# --index-url https://mirrors.sustech.edu.cn/pypi/simple
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user