Add initial implementation of AI agent with mouse and keyboard control features

This commit is contained in:
Showdown76 2025-05-19 00:48:14 +02:00
parent ed34ebca6a
commit 7e612c1af7
8 changed files with 320 additions and 0 deletions

16
.vscode/launch.json vendored Normal file
View File

@ -0,0 +1,16 @@
{
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
"name": "Python Debugger: Current File",
"type": "debugpy",
"request": "launch",
"program": "main.py",
"console": "integratedTerminal"
}
]
}

29
ai/compute.py Normal file
View File

@ -0,0 +1,29 @@
import pyautogui
from objects.inputs import MouseInput, KeyboardInput, ButtonType
def press_mouse(mouse_input: MouseInput) -> None:
    """Press a mouse button at the position given by *mouse_input*.

    Accepts ``click_type`` either as a ``ButtonType`` member or as its raw
    string value (e.g. ``"left"``), because tool-call arguments arrive as
    plain JSON strings; previously a string value matched none of the enum
    comparisons and the click was silently dropped.
    """
    x, y = mouse_input.x, mouse_input.y
    button = mouse_input.click_type
    # Coerce JSON string values ("left", "double_left", ...) to the enum.
    if isinstance(button, str):
        button = ButtonType(button)
    if button == ButtonType.LEFT:
        pyautogui.click(x, y, button='left')
    elif button == ButtonType.DOUBLE_LEFT:
        pyautogui.doubleClick(x, y)
    elif button == ButtonType.RIGHT:
        pyautogui.click(x, y, button='right')
    elif button == ButtonType.MIDDLE:
        pyautogui.click(x, y, button='middle')
def press_keyboard(keyboard_input: KeyboardInput) -> None:
    """Types the given sequence of keys."""
    # Only emit keystrokes when there is actual text to type.
    if keyboard_input.text:
        pyautogui.typewrite(keyboard_input.text)
    # Optionally confirm the input with the Enter key.
    if keyboard_input.press_enter:
        pyautogui.press('enter')
def _execute(name, args):
    """Dispatch a tool call by name to the matching input handler.

    Unknown tool names are ignored, matching the original behavior.
    """
    handlers = {
        "click_button": lambda a: press_mouse(MouseInput(**a)),
        "type_text": lambda a: press_keyboard(KeyboardInput(**a)),
    }
    handler = handlers.get(name)
    if handler is not None:
        handler(args)

73
ai/processor.py Normal file
View File

@ -0,0 +1,73 @@
import traceback
import json # new
import openai
from objects import aic
import ai.compute
class AIProcessor:
    """Drives a chat session with the OpenAI API and executes tool calls.

    Tool calls requested by the model are executed immediately through
    ``ai.compute`` and also returned to the caller as a list of
    ``{"name", "arguments"}`` dicts; plain text replies are appended to the
    session history and returned as a string.  Errors are reported as
    strings rather than raised.
    """

    def __init__(self, api_key: str, model: str = "gpt-4.1"):
        self.oai = openai.Client(api_key=api_key)
        self.model = model
        # Every session starts with the shared system prompt.
        self.session = aic.Session(messages=[aic.Message(role="system", content=aic.SYSTEM_PROMPT)], model=model)  # type: ignore
        # Local binding of python callables used as tool implementations.
        self._tools_map = {
            "click_button": self._click_button,
            "type_text": self._type_text,
        }

    # --------------------- tool implementations --------------------- #
    def _click_button(self, x: int, y: int, click_type: str) -> str:
        # TODO: integrate real mouse automation.
        return f"Performed {click_type} click at ({x}, {y})."

    def _type_text(self, text: str) -> str:
        # TODO: integrate real typing automation.
        return f'Typed text: "{text}"'

    def _execute_tool(self, name: str, arguments: dict) -> str:
        """Run a locally-bound tool and return its textual result."""
        func = self._tools_map.get(name)
        if not func:
            return f"Unknown tool: {name}"
        try:
            return func(**arguments)
        except Exception as e:
            traceback.print_exc()
            return f"Error executing {name}: {e}"

    # -------------------------- main entry -------------------------- #
    def process(self, prompt: str, img_data: str | bytes | None = None) -> str | list[dict]:
        """Send *prompt* (optionally with a base64-encoded image) to the model.

        Returns the assistant's text reply, or the list of executed tool
        calls, or an error string.
        """
        try:
            self.session.messages.append(
                aic.Message(role="user", content=prompt, image=img_data)
            )
            response = self.oai.chat.completions.create(
                model=self.model,
                messages=self.session.messages_dict(),
                tools=aic.FUNCTIONS,  # type: ignore
            )
            # Return tool call requests if any.
            tool_calls = getattr(response.choices[0].message, "tool_calls", None)
            if tool_calls:
                # Parse each arguments payload once and reuse it for both
                # execution and the returned summary (it was previously
                # json-parsed twice per call).
                calls = [
                    {
                        "name": tc.function.name,
                        "arguments": json.loads(tc.function.arguments),
                    }
                    for tc in tool_calls
                ]
                for call in calls:
                    ai.compute._execute(name=call["name"], args=call["arguments"])
                return calls
            # Otherwise return final assistant content.
            print(f"Response: {json.dumps(response.to_dict(), indent=4)}")  # debug
            output_text: str = response.choices[0].message.content  # type: ignore
            self.session.messages.append(
                aic.Message(role="assistant", content=output_text)
            )
            return output_text
        except Exception as e:
            traceback.print_exc()
            return f"Error processing request: {str(e)}"

18
main.py Normal file
View File

@ -0,0 +1,18 @@
from dotenv import load_dotenv
import os
import ai.processor
import webserver.web
load_dotenv()
def main():
    """Wire up the AI processor from the environment and start the web server."""
    config = {
        "api_key": os.getenv("OPENAI_API_KEY", ""),
        "model": os.getenv("OPENAI_MODEL", "gpt-4.1"),
    }
    processor = ai.processor.AIProcessor(**config)
    webserver.web.WebServerApp(processor).run()


if __name__ == "__main__":
    main()

97
objects/aic.py Normal file
View File

@ -0,0 +1,97 @@
from dataclasses import dataclass
from typing import Optional, Union
from typing_extensions import Literal
# Models known to work with the function-calling schema defined below.
compatible_models = ("gpt-4.1", "gpt-4.1-mini", 'o4-mini', 'gpt-4.1-nano')

# System prompt injected as the first message of every Session.
SYSTEM_PROMPT = """
You are CopeAI Windows Agent. You are currently controlling a Windows 11 machine. \
You are capable to see the screen, click buttons, type text, and interact with the system. \
You will use the functions provided. The resolution of the machine is 1920x1080. \
Your text response must indicate what you are doing."""

# OpenAI tool definitions passed as `tools=` to chat.completions.create.
# The "click_type" enum values must stay in sync with objects.inputs.ButtonType.
FUNCTIONS = [
    {
        "type": "function",
        "function": {
            "name": "click_button",
            "description": "Click a button at the specified (x, y) position with the given click type.",
            "parameters": {
                "type": "object",
                "properties": {
                    "x": {
                        "type": "integer",
                        "description": "The X coordinate of the button."
                    },
                    "y": {
                        "type": "integer",
                        "description": "The Y coordinate of the button."
                    },
                    "click_type": {
                        "type": "string",
                        "enum": ["left", "double_left", "middle", "right"],
                        "description": "The type of mouse click to perform."
                    }
                },
                "required": ["click_type", "x", "y"],
            }
        }
    },
    {
        "type": "function",
        "function": {
            "name": "type_text",
            "description": "Type the given text at the current cursor location.",
            "parameters": {
                "type": "object",
                "properties": {
                    "text": {
                        "type": "string",
                        "description": "The text to type."
                    },
                    "press_enter": {
                        "type": "boolean",
                        "default": False,
                        "description": "Whether to press Enter after typing the text."
                    }
                },
                "required": ["text", "press_enter"],
            }
        }
    }
]
@dataclass
class Message:
    """One chat message; serializes to the OpenAI chat-completions format."""

    role: Literal['user', 'assistant', 'system', 'tool']  # + tool
    content: str
    image: Optional[Union[str, bytes]] = None  # base64-encoded PNG payload
    disable_image: bool = False  # force text-only output even if image is set
    name: Optional[str] = None  # new only for tool messages

    def to_dict(self) -> dict:
        """Serialize to an API-ready dict.

        The multimodal (text + image_url) form is used only when an image is
        present AND images are not disabled.  The previous condition was
        inverted: a set ``disable_image`` flag, or the combination of no
        image with ``disable_image=True``, still produced the image payload
        (embedding the literal string "None" in the data URL).
        """
        if self.image and not self.disable_image:
            content = [
                {"type": "text", "text": self.content},
                {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{self.image}"}},
            ]
        else:
            content = self.content
        base = {"role": self.role, "content": content}
        if self.role == "tool" and self.name:  # include tool name if present
            base["name"] = self.name
        return base
@dataclass
class Session:
    """An ordered chat history bound to a model name."""

    messages: list[Message]
    model: str = "gpt-4.1"

    def messages_dict(self) -> list:
        """Serialized form of every message, in order."""
        return [m.to_dict() for m in self.messages]

    def to_dict(self) -> dict:
        """Full session payload: serialized messages plus the model name."""
        return {"messages": self.messages_dict(), "model": self.model}

20
objects/inputs.py Normal file
View File

@ -0,0 +1,20 @@
from dataclasses import dataclass
from enum import Enum
class ButtonType(Enum):
    """Mouse click variants supported by the automation layer.

    Values match the "click_type" enum in the OpenAI tool schema.
    """
    LEFT = "left"
    DOUBLE_LEFT = "double_left"
    RIGHT = "right"
    MIDDLE = "middle"


@dataclass
class KeyboardInput:
    """A chunk of text to type, optionally followed by the Enter key."""
    text: str  # literal characters to send
    press_enter: bool = False  # press Enter after typing


@dataclass
class MouseInput:
    """A single mouse action at screen coordinates (x, y).

    ``click_type`` is a single ButtonType — the previous
    ``list[ButtonType]`` annotation was wrong, since consumers compare it
    against individual enum members.  JSON tool-call arguments supply it
    as a plain string, so it is coerced to the enum on construction.
    """
    x: int
    y: int
    click_type: ButtonType

    def __post_init__(self) -> None:
        # Accept raw string values ("left", ...) from JSON tool calls.
        if isinstance(self.click_type, str):
            self.click_type = ButtonType(self.click_type)

7
requirements.txt Normal file
View File

@ -0,0 +1,7 @@
openai
requests
flask
python-dotenv
# libraries to control mouse+keyboard+see screen
pyautogui
pynput

60
webserver/web.py Normal file
View File

@ -0,0 +1,60 @@
from flask import Flask, request, jsonify
import os, ai.processor
from dotenv import load_dotenv
load_dotenv()
class WebServerApp:
    """Thin Flask wrapper exposing the AI processor over HTTP."""

    def __init__(self, aip):
        self.app = Flask(__name__)
        # Bind the processor BEFORE registering routes so the route
        # closures can never observe a partially-initialized instance.
        self.aip: ai.processor.AIProcessor = aip
        self._register_routes()

    def _register_routes(self):
        @self.app.route('/api/test')
        def test():
            return jsonify({"message": "Hello, World!"})

        @self.app.route('/api/request', methods=['POST'])
        def handle_request():
            import base64  # local import: only this route needs it

            # Fields are sent as form-data.
            data = request.form.to_dict()
            if not data:
                return jsonify({"error": "No data provided"}), 400
            prompt = data.get('prompt', '')
            if not prompt:
                return jsonify({"error": "No prompt provided"}), 400
            # Optional screenshot upload, forwarded to the processor as
            # base64 text.  The old str->bytes re-encode branch was dead
            # (uploads always read as bytes) and, had it run, would have
            # broken the downstream data-URL formatting.
            img_data = None
            img_file = request.files.get('img')
            if img_file:
                raw = img_file.read()
                if raw:
                    img_data = base64.b64encode(raw).decode('utf-8')
            response = self.aip.process(prompt, img_data)
            return jsonify({"response": response}), 200

        @self.app.route('/api/health')
        def health():
            return jsonify({"status": "healthy"})

    def run(self, *args, **kwargs):
        """Start the Flask server; arguments pass through to Flask.run."""
        self.app.run(*args, **kwargs)
# Example usage:
# if __name__ == "__main__":
# server = WebServerApp()
# server.run()