Add initial implementation of AI agent with mouse and keyboard control features

2025-05-19 00:48:14 +02:00
parent ed34ebca6a
commit 7e612c1af7
8 changed files with 320 additions and 0 deletions
--- a/objects/aic.py
+++ b/objects/aic.py
@@ -0,0 +1,97 @@
+from dataclasses import dataclass
+from typing import Optional, Union
+from typing_extensions import Literal
+
+# Model identifiers listed as compatible with this agent.
+# NOTE(review): nothing in this file checks a model name against this tuple —
+# presumably callers do; confirm.
+compatible_models = ("gpt-4.1", "gpt-4.1-mini", 'o4-mini', 'gpt-4.1-nano')
+
+# System prompt text for the agent. The trailing backslashes join the source
+# lines, so the value is a leading newline followed by one long line.
+# NOTE(review): the 1920x1080 resolution is hard-coded in the prompt text.
+SYSTEM_PROMPT = """
+You are CopeAI Windows Agent. You are currently controlling a Windows 11 machine. \
+You are capable to see the screen, click buttons, type text, and interact with the system. \
+You will use the functions provided. The resolution of the machine is 1920x1080. \
+Your text response must indicate what you are doing."""
+
+# Tool definitions: one entry per callable function, each describing its
+# arguments with a JSON-Schema-style "parameters" object.
+# NOTE(review): the layout looks like the OpenAI chat-completions "tools"
+# format — confirm against the code that sends it.
+FUNCTIONS = [
+    # click_button(x, y, click_type): all three arguments are required.
+    # The click_type enum values match the ButtonType values in objects/inputs.py.
+    {
+        "type": "function",
+        "function": {
+            "name": "click_button",
+            "description": "Click a button at the specified (x, y) position with the given click type.",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "x": {
+                        "type": "integer",
+                        "description": "The X coordinate of the button."
+                    },
+                    "y": {
+                        "type": "integer",
+                        "description": "The Y coordinate of the button."
+                    },
+                    "click_type": {
+                        "type": "string",
+                        "enum": ["left", "double_left", "middle", "right"],
+                        "description": "The type of mouse click to perform."
+                    }
+                },
+                "required": ["click_type", "x", "y"],
+            }
+        }
+    },
+    # type_text(text, press_enter): both arguments are required.
+    # NOTE(review): press_enter declares a default of False yet is also listed
+    # as required, so the default is never relied on — confirm which is intended.
+    {
+        "type": "function",
+        "function": {
+            "name": "type_text",
+            "description": "Type the given text at the current cursor location.",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "text": {
+                        "type": "string",
+                        "description": "The text to type."
+                    },
+                    "press_enter": {
+                        "type": "boolean",
+                        "default": False,
+                        "description": "Whether to press Enter after typing the text."
+                    }
+                },
+                "required": ["text", "press_enter"],
+            }
+        }
+    }
+]
+
+
+@dataclass
+class Message:
+    """A single chat message that can be serialised into a request payload.
+
+    Attributes:
+        role: Author of the message: 'user', 'assistant', 'system' or 'tool'.
+        content: Text of the message.
+        image: Optional PNG image to attach, given either as base64 text
+            (``str`` or ASCII ``bytes``) or as raw image bytes.
+        disable_image: When True, the image is left out of the serialised
+            message even if one is set.
+        name: Optional name; only emitted for 'tool' messages.
+    """
+    role: Literal['user', 'assistant', 'system', 'tool']
+    content: str
+    image: Optional[Union[str, bytes]] = None
+    disable_image: bool = False
+    name: Optional[str] = None
+
+    def _image_as_base64(self) -> str:
+        """Return ``self.image`` as base64 text suitable for a data URL."""
+        image = self.image
+        if isinstance(image, (bytes, bytearray)):
+            import base64  # local import keeps this block self-contained
+
+            try:
+                # Output of base64.b64encode() is pure ASCII: just decode it.
+                return bytes(image).decode("ascii")
+            except UnicodeDecodeError:
+                # Raw image data (a PNG starts with 0x89) is not ASCII, so it
+                # still has to be base64-encoded.
+                return base64.b64encode(image).decode("ascii")
+        return image
+
+    def to_dict(self) -> dict:
+        """Serialise the message to a plain dict.
+
+        ``content`` is the bare text unless an image is set and not disabled,
+        in which case it is a two-part list: the text part followed by an
+        ``image_url`` part carrying a ``data:image/png;base64,...`` URL.
+
+        Returns:
+            A dict with ``role`` and ``content``, plus ``name`` when the role
+            is 'tool' and a name is set.
+        """
+        content: Union[str, list]
+        # Attach the image only when there is one AND it has not been disabled.
+        # The previous condition sent "base64,None" for image-less messages
+        # with disable_image=True and ignored disable_image when an image was set.
+        if self.image and not self.disable_image:
+            data_url = f"data:image/png;base64,{self._image_as_base64()}"
+            content = [
+                {"type": "text", "text": self.content},
+                {"type": "image_url", "image_url": {"url": data_url}},
+            ]
+        else:
+            content = self.content
+
+        base = {"role": self.role, "content": content}
+        if self.role == "tool" and self.name:  # include tool name if present
+            base["name"] = self.name
+        return base
+
+@dataclass
+class Session:
+    """An ordered list of messages together with the model that handles them."""
+    messages: list[Message]
+    model: str = "gpt-4.1"
+
+    def to_dict(self) -> dict:
+        """Return the session as ``{"messages": [...], "model": ...}``."""
+        payload = {"messages": self.messages_dict()}
+        payload["model"] = self.model
+        return payload
+
+    def messages_dict(self) -> list:
+        """Return every message serialised with ``to_dict``, in order."""
+        serialised = []
+        for entry in self.messages:
+            serialised.append(entry.to_dict())
+        return serialised
--- a/objects/inputs.py
+++ b/objects/inputs.py
@@ -0,0 +1,20 @@
+from dataclasses import dataclass
+from enum import Enum
+
+class ButtonType(Enum):
+    """Kinds of mouse click.
+
+    The member values are the same strings as the ``click_type`` enum of the
+    ``click_button`` tool schema in objects/aic.py.
+    """
+    LEFT = "left"
+    DOUBLE_LEFT = "double_left"
+    RIGHT = "right"
+    MIDDLE = "middle"
+
+@dataclass
+class KeyboardInput:
+    """Text to type, plus whether Enter follows it.
+
+    The fields mirror the arguments of the ``type_text`` tool schema in
+    objects/aic.py.
+    """
+    text: str  # the text to type
+    press_enter: bool = False  # press Enter after the text when True
+
+@dataclass
+class MouseInput:
+    """A mouse click at a screen position.
+
+    The fields mirror the arguments of the ``click_button`` tool schema in
+    objects/aic.py.
+    """
+    x: int  # X coordinate of the click
+    y: int  # Y coordinate of the click
+    # NOTE(review): annotated as a list, but the click_button schema passes a
+    # single click_type string — confirm whether this should be a plain ButtonType.
+    click_type: list[ButtonType]
+