Add initial implementation of AI agent with mouse and keyboard control features

This commit is contained in:
2025-05-19 00:48:14 +02:00
parent ed34ebca6a
commit 7e612c1af7
8 changed files with 320 additions and 0 deletions

97
objects/aic.py Normal file
View File

@@ -0,0 +1,97 @@
from dataclasses import dataclass
from typing import Optional, Union
from typing_extensions import Literal
# Model identifiers this agent is declared compatible with.
compatible_models = (
    "gpt-4.1",
    "gpt-4.1-mini",
    "o4-mini",
    "gpt-4.1-nano",
)
# System prompt describing the agent's role and the machine it controls.
# The value is a leading newline followed by a single line of text; the
# sentences are joined with single spaces.
SYSTEM_PROMPT = (
    "\n"
    "You are CopeAI Windows Agent. "
    "You are currently controlling a Windows 11 machine. "
    "You are capable to see the screen, click buttons, type text, "
    "and interact with the system. "
    "You will use the functions provided. "
    "The resolution of the machine is 1920x1080. "
    "Your text response must indicate what you are doing."
)
# Tool schema: click at a screen position with a given click type.
_CLICK_BUTTON_TOOL = {
    "type": "function",
    "function": {
        "name": "click_button",
        "description": "Click a button at the specified (x, y) position with the given click type.",
        "parameters": {
            "type": "object",
            "properties": {
                "x": {
                    "type": "integer",
                    "description": "The X coordinate of the button.",
                },
                "y": {
                    "type": "integer",
                    "description": "The Y coordinate of the button.",
                },
                "click_type": {
                    "type": "string",
                    "enum": ["left", "double_left", "middle", "right"],
                    "description": "The type of mouse click to perform.",
                },
            },
            "required": ["click_type", "x", "y"],
        },
    },
}

# Tool schema: type text, optionally followed by Enter.
_TYPE_TEXT_TOOL = {
    "type": "function",
    "function": {
        "name": "type_text",
        "description": "Type the given text at the current cursor location.",
        "parameters": {
            "type": "object",
            "properties": {
                "text": {
                    "type": "string",
                    "description": "The text to type.",
                },
                "press_enter": {
                    "type": "boolean",
                    "default": False,
                    "description": "Whether to press Enter after typing the text.",
                },
            },
            "required": ["text", "press_enter"],
        },
    },
}

# Tool definitions offered to the model, in declaration order.
FUNCTIONS = [_CLICK_BUTTON_TOOL, _TYPE_TEXT_TOOL]
@dataclass
class Message:
    """A single conversation message that can be serialised to a plain dict.

    The serialised form always carries ``role`` and ``content``. When an
    image is attached, ``content`` becomes a list holding a text part and an
    ``image_url`` part whose URL is a ``data:image/png;base64,...`` URI.
    """

    # Speaker of the message; "tool" marks a tool/function result.
    role: Literal['user', 'assistant', 'system', 'tool']
    # Text body of the message.
    content: str
    # Image payload interpolated after "base64," in the data URI.
    # NOTE(review): a ``bytes`` value is formatted via its repr (b'...'),
    # which is not valid base64 text -- callers should pass an
    # already-base64-encoded ``str`` until bytes handling is confirmed.
    image: Optional[Union[str, bytes]] = None
    # When True the image is left out of the serialised message.
    disable_image: bool = False
    # Tool name; only emitted for messages whose role is "tool".
    name: Optional[str] = None

    def to_dict(self) -> dict:
        """Return the message as a dict.

        Returns:
            ``{"role": ..., "content": ...}`` where ``content`` is the plain
            text, or a ``[text part, image_url part]`` list when an image is
            set and ``disable_image`` is False. A ``"name"`` key is added for
            tool messages that have a name.
        """
        # Attach the image only when one exists AND it has not been disabled.
        # The previous condition sent an image part whenever disable_image
        # was True (even "base64,None" when no image was set).
        attach_image = bool(self.image) and not self.disable_image

        if attach_image:
            content = [
                {"type": "text", "text": self.content},
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/png;base64,{self.image}"},
                },
            ]
        else:
            content = self.content

        payload = {"role": self.role, "content": content}
        if self.role == "tool" and self.name:
            payload["name"] = self.name
        return payload
@dataclass
class Session:
    """A conversation: an ordered list of messages plus the model name."""

    # Messages in conversation order.
    messages: list[Message]
    # Model identifier stored alongside the messages.
    model: str = "gpt-4.1"

    def to_dict(self) -> dict:
        """Return the session as ``{"messages": [...], "model": ...}``."""
        serialised = self.messages_dict()
        return {"messages": serialised, "model": self.model}

    def messages_dict(self) -> list:
        """Return every message converted with its own ``to_dict()``."""
        converted = []
        for entry in self.messages:
            converted.append(entry.to_dict())
        return converted

20
objects/inputs.py Normal file
View File

@@ -0,0 +1,20 @@
from dataclasses import dataclass
from enum import Enum
class ButtonType(Enum):
    """Kinds of mouse click, keyed by their string value."""

    LEFT = "left"
    DOUBLE_LEFT = "double_left"
    RIGHT = "right"
    MIDDLE = "middle"
@dataclass
class KeyboardInput:
    """Text to type, with an optional trailing Enter press."""

    # The text to type.
    text: str
    # Whether Enter should be pressed after the text; off by default.
    press_enter: bool = False
@dataclass
class MouseInput:
    """A mouse action: a screen position plus the click type(s) to perform."""

    # Horizontal screen coordinate.
    x: int
    # Vertical screen coordinate.
    y: int
    # NOTE(review): annotated as a list of ButtonType, while the
    # click_button tool schema supplies a single click type -- confirm
    # whether this should be a plain ButtonType.
    click_type: list[ButtonType]