Add initial implementation of AI agent with mouse and keyboard control features

2025-05-19 00:48:14 +02:00
parent ed34ebca6a
commit 7e612c1af7
8 changed files with 320 additions and 0 deletions
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -0,0 +1,16 @@
 {
    // Use IntelliSense to learn about possible attributes.
    // Hover to view descriptions of existing attributes.
    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
    "version": "0.2.0",
    "configurations": [
        {
            "name": "Python Debugger: Current File",
            "type": "debugpy",
            "request": "launch",
            "program": "main.py",
            "console": "integratedTerminal"
        }
    ]
 }
--- a/ai/compute.py
+++ b/ai/compute.py
@@ -0,0 +1,29 @@
 import pyautogui
 from objects.inputs import MouseInput, KeyboardInput, ButtonType
 def press_mouse(mouse_input: MouseInput) -> None:
     """Click at the screen position described by ``mouse_input``.

     ``mouse_input.click_type`` may be a ``ButtonType`` member or the raw
     string value of one (for example ``"left"``).

     Raises:
         ValueError: if ``click_type`` is not a valid ``ButtonType`` value.
     """
     # Tool-call arguments are decoded JSON, so click_type normally arrives as
     # a plain string. Comparing that string against enum members never
     # matches, which made every click a silent no-op; normalise it first.
     button = ButtonType(mouse_input.click_type)
     x, y = mouse_input.x, mouse_input.y

     if button is ButtonType.DOUBLE_LEFT:
         pyautogui.doubleClick(x, y)
         return

     # Single clicks differ only in the pyautogui button name.
     single_click_buttons = {
         ButtonType.LEFT: 'left',
         ButtonType.RIGHT: 'right',
         ButtonType.MIDDLE: 'middle',
     }
     pyautogui.click(x, y, button=single_click_buttons[button])
 def press_keyboard(keyboard_input: KeyboardInput) -> None:
     """Type the input's text (if any), then press Enter when requested."""
     # Empty text is skipped so that an Enter-only action is possible.
     if keyboard_input.text:
         pyautogui.typewrite(keyboard_input.text)

     if not keyboard_input.press_enter:
         return
     pyautogui.press('enter')
 def _execute(name, args):
    if name == "click_button":
        press_mouse(MouseInput(**args))
    elif name == "type_text":
        press_keyboard(KeyboardInput(**args))
--- a/ai/processor.py
+++ b/ai/processor.py
@@ -0,0 +1,73 @@
 import traceback
 import json                                           # new
 import openai
 from objects import aic
 import ai.compute
 class AIProcessor:
    def __init__(self, api_key: str, model: str = "gpt-4.1"):
        self.oai = openai.Client(api_key=api_key)
        self.model = model
        self.session = aic.Session(messages=[aic.Message(role="system", content=aic.SYSTEM_PROMPT)], model=model) # type: ignore
        self._tools_map = {                           # local binding of python callables
            "click_button": self._click_button,
            "type_text": self._type_text,
        }
    # --------------------- tool implementations --------------------- #
    def _click_button(self, x: int, y: int, click_type: str) -> str:
        # TODO: integrate real mouse automation.
        return f"Performed {click_type} click at ({x}, {y})."
    def _type_text(self, text: str) -> str:
        # TODO: integrate real typing automation.
        return f'Typed text: "{text}"'
    def _execute_tool(self, name: str, arguments: dict) -> str:
        func = self._tools_map.get(name)
        if not func:
            return f"Unknown tool: {name}"
        try:
            return func(**arguments)
        except Exception as e:
            traceback.print_exc()
            return f"Error executing {name}: {e}"
    # -------------------------- main entry -------------------------- #
    def process(self, prompt: str, img_data: str | bytes | None = None) -> str | list[dict]:
        try:
            self.session.messages.append(
                aic.Message(role="user", content=prompt, image=img_data)
            )
            response = self.oai.chat.completions.create(
                model=self.model,
                messages=self.session.messages_dict(),
                tools=aic.FUNCTIONS,  # type: ignore
            )
            # return tool call requests if any
            tool_calls = getattr(response.choices[0].message, "tool_calls", None)
            if tool_calls:
                for tc in tool_calls:
                    ai.compute._execute(
                        name=tc.function.name,
                        args=json.loads(tc.function.arguments)
                    )
                return [
                    {
                        "name": tc.function.name,
                        "arguments": json.loads(tc.function.arguments),
                    }
                    for tc in tool_calls
                ]
            # otherwise return final assistant content
            print(f"Response: {json.dumps(response.to_dict(), indent=4)}")  # debug
            output_text: str = response.choices[0].message.content  # type: ignore
            self.session.messages.append(
                aic.Message(role="assistant", content=output_text)
            )
            return output_text
        except Exception as e:
            traceback.print_exc()
            return f"Error processing request: {str(e)}"
--- a/main.py
+++ b/main.py
@@ -0,0 +1,18 @@
 from dotenv import load_dotenv
 import os
 import ai.processor
 import webserver.web
 load_dotenv()
 def main():
     """Build the AI processor from environment settings and start the web server.

     Reads ``OPENAI_API_KEY`` (required) and ``OPENAI_MODEL`` (optional,
     defaults to ``gpt-4.1``) from the environment.

     Raises:
         SystemExit: if ``OPENAI_API_KEY`` is missing or blank.
     """
     # Fail at startup instead of handing an empty key to the client and only
     # discovering the problem on the first request.
     api_key = os.getenv("OPENAI_API_KEY", "").strip()
     if not api_key:
         raise SystemExit(
             "OPENAI_API_KEY is not set; define it in the environment or in a .env file."
         )

     aip = ai.processor.AIProcessor(
         api_key=api_key,
         # An empty OPENAI_MODEL falls back to the default as well.
         model=os.getenv("OPENAI_MODEL") or "gpt-4.1",
     )
     server = webserver.web.WebServerApp(aip)
     server.run()
 # Start the server only when this file is run as a script, not on import.
 if __name__ == "__main__":
    main()
--- a/objects/aic.py
+++ b/objects/aic.py
@@ -0,0 +1,97 @@
 from dataclasses import dataclass
 from typing import Optional, Union
 from typing_extensions import Literal
 # Model identifiers. Not referenced elsewhere in the visible code;
 # presumably the models this agent is meant to run against (TODO confirm).
 compatible_models = ("gpt-4.1", "gpt-4.1-mini", 'o4-mini', 'gpt-4.1-nano')
 # System message that seeds every session (see AIProcessor.__init__).
 # The trailing backslashes join the source lines, so the sentences are sent
 # as one paragraph; the string still begins with a newline.
 # NOTE(review): the screen resolution is hard-coded in the prompt text.
 SYSTEM_PROMPT = """
 You are CopeAI Windows Agent. You are currently controlling a Windows 11 machine. \
 You are capable to see the screen, click buttons, type text, and interact with the system. \
 You will use the functions provided. The resolution of the machine is 1920x1080. \
 Your text response must indicate what you are doing."""
 # Tool (function-calling) schemas passed as `tools=` to the chat-completions
 # request in ai/processor.py. The tool names are the ones that
 # ai/compute.py `_execute` dispatches on.
 FUNCTIONS = [
    {
        "type": "function",
        "function": {
            "name": "click_button",
            "description": "Click a button at the specified (x, y) position with the given click type.",
            "parameters": {
                "type": "object",
                "properties": {
                    "x": {
                        "type": "integer",
                        "description": "The X coordinate of the button."
                    },
                    "y": {
                        "type": "integer",
                        "description": "The Y coordinate of the button."
                    },
                    "click_type": {
                        "type": "string",
                        # Same strings as the ButtonType values in objects/inputs.py.
                        "enum": ["left", "double_left", "middle", "right"],
                        "description": "The type of mouse click to perform."
                    }
                },
                "required": ["click_type", "x", "y"],
            }
        }
    },
    {
        "type": "function",
        "function": {
            "name": "type_text",
            "description": "Type the given text at the current cursor location.",
            "parameters": {
                "type": "object",
                "properties": {
                    "text": {
                        "type": "string",
                        "description": "The text to type."
                    },
                    "press_enter": {
                        "type": "boolean",
                        "default": False,
                        "description": "Whether to press Enter after typing the text."
                    }
                },
                # press_enter is listed as required even though it declares a default.
                "required": ["text", "press_enter"],
            }
        }
    }
 ]
@dataclass
class Message:
    """One chat message, convertible to the dict form sent to the chat API."""

    role: Literal['user', 'assistant', 'system', 'tool']
    content: str
    # Optional image: a base64 string, or bytes that are treated as raw image
    # data and base64-encoded when the message is serialised.
    image: Optional[Union[str, bytes]] = None
    # When true, the image is left out of the serialised message.
    disable_image: bool = False
    # Tool name; only emitted for messages whose role is "tool".
    name: Optional[str] = None

    def to_dict(self) -> dict:
        """Return the message as a dict for the chat-completions request.

        ``content`` is the plain text unless an image is present and not
        disabled, in which case it becomes a two-part list (text, then a
        ``data:image/png;base64,...`` image URL).
        """
        import base64

        base: dict = {"role": self.role, "content": self.content}

        # Attach the image only when there is one AND it has not been
        # disabled. The previous condition emitted a "base64,None" URL for
        # disable_image=True without an image, and ignored the flag otherwise.
        if self.image and not self.disable_image:
            encoded = self.image
            if isinstance(encoded, (bytes, bytearray)):
                # Formatting bytes directly would embed their repr (b'...').
                encoded = base64.b64encode(encoded).decode("ascii")
            base["content"] = [
                {"type": "text", "text": self.content},
                {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{encoded}"}},
            ]

        if self.role == "tool" and self.name:
            base["name"] = self.name
        return base
@dataclass
class Session:
    """A conversation: its ordered messages plus the model name."""

    messages: list[Message]
    model: str = "gpt-4.1"

    def messages_dict(self) -> list:
        """Return every message in dict form, in conversation order."""
        serialized = []
        for message in self.messages:
            serialized.append(message.to_dict())
        return serialized

    def to_dict(self) -> dict:
        """Return the whole session (messages and model) as a dict."""
        return {
            "messages": self.messages_dict(),
            "model": self.model,
        }
--- a/objects/inputs.py
+++ b/objects/inputs.py
@@ -0,0 +1,20 @@
 from dataclasses import dataclass
 from enum import Enum
 class ButtonType(Enum):
    """Kinds of mouse click the agent can perform.

    The values are the same strings as the ``click_type`` enum of the
    ``click_button`` tool schema in objects/aic.py.
    """
    LEFT = "left"
    DOUBLE_LEFT = "double_left"  # handled with pyautogui.doubleClick in ai/compute.py
    RIGHT = "right"
    MIDDLE = "middle"
@dataclass
 class KeyboardInput:
    """Arguments for one typing action, consumed by ``press_keyboard`` in ai/compute.py."""
    text: str  # characters to type; an empty string types nothing
    press_enter: bool = False  # when true, Enter is pressed after the text
@dataclass
class MouseInput:
    """Arguments for one mouse click, consumed by ``press_mouse`` in ai/compute.py.

    ``click_type`` accepts a ``ButtonType`` member or its raw string value
    (for example ``"left"``) and is always stored as a ``ButtonType``.

    Raises:
        ValueError: if ``click_type`` is not a valid ``ButtonType`` value.
    """

    x: int  # screen X coordinate of the click
    y: int  # screen Y coordinate of the click
    # A single click kind. It was annotated as a list, but it is compared
    # against individual ButtonType members.
    click_type: ButtonType

    def __post_init__(self) -> None:
        # Instances are built from decoded JSON tool arguments, where the
        # click type is a plain string; convert it so enum comparisons work.
        self.click_type = ButtonType(self.click_type)
--- a/requirements.txt
+++ b/requirements.txt
@@ -0,0 +1,7 @@
 openai
 requests
 flask
 python-dotenv
 # libraries to control mouse+keyboard+see screen
 pyautogui
 pynput
--- a/webserver/web.py
+++ b/webserver/web.py
@@ -0,0 +1,60 @@
 from flask import Flask, request, jsonify
 import os, ai.processor
 from dotenv import load_dotenv
 load_dotenv()
 class WebServerApp:
     """Flask application that exposes an ``AIProcessor`` over HTTP.

     Routes:
         GET  /api/test     fixed greeting.
         POST /api/request  form-data with ``prompt`` and an optional ``img``
                            file; returns the processor's response.
         GET  /api/health   fixed health status.
     """

     def __init__(self, aip):
         """Create the Flask app for the given processor and register the routes."""
         self.app = Flask(__name__)
         # Bind the processor before the handlers that use it are registered.
         self.aip: ai.processor.AIProcessor = aip
         self._register_routes()

     def _register_routes(self):
         """Attach the route handlers to ``self.app``."""

         @self.app.route('/api/test')
         def test():
             return jsonify({"message": "Hello, World!"})

         @self.app.route('/api/request', methods=['POST'])
         def handle_request():
             import base64

             # sent as form-data
             data = request.form.to_dict()
             if not data:
                 return jsonify({"error": "No data provided"}), 400

             prompt = data.get('prompt', '')
             if not prompt:
                 return jsonify({"error": "No prompt provided"}), 400

             # The uploaded image, if any, is forwarded as base64 text.
             # An absent or empty upload means no image.
             img_data = None
             img_file = request.files.get('img')
             if img_file:
                 raw = img_file.read()
                 if raw:
                     img_data = base64.b64encode(raw).decode('utf-8')

             response = self.aip.process(prompt, img_data)
             return jsonify({"response": response}), 200

         @self.app.route('/api/health')
         def health():
             return jsonify({"status": "healthy"})

     def run(self, *args, **kwargs):
         """Start the Flask server; arguments are passed through to ``Flask.run``."""
         self.app.run(*args, **kwargs)
 # Example usage (WebServerApp requires an AIProcessor instance; see main.py):
 # if __name__ == "__main__":
 #     server = WebServerApp(aip)
 #     server.run()