"""Prompt text, tool schemas, and message/session containers for the agent.

The module exposes:

* ``compatible_models`` -- model identifiers this module lists as compatible.
* ``SYSTEM_PROMPT`` -- the system prompt text.
* ``FUNCTIONS`` -- tool definitions in JSON-schema "function" form.
* ``Message`` / ``Session`` -- dataclasses that serialise to plain dicts.
"""

import base64
from dataclasses import dataclass
# ``list[Message]`` below already requires Python 3.9+, where ``Literal`` is
# part of the standard ``typing`` module.
from typing import Literal, Optional, Union

compatible_models = ("gpt-4.1", "gpt-4.1-mini", 'o4-mini', 'gpt-4.1-nano')

# The backslashes join the physical lines, so the prompt is a leading newline
# followed by one single-line paragraph.
SYSTEM_PROMPT = """
You are CopeAI Windows Agent. You are currently controlling a Windows 11 machine. \
You are capable to see the screen, click buttons, type text, and interact with the system. \
You will use the functions provided. The resolution of the machine is 1920x1080. \
Your text response must indicate what you are doing."""

# Tool definitions offered to the model. Each entry names a function and
# describes its arguments with a JSON-schema object.
FUNCTIONS = [
    {
        "type": "function",
        "function": {
            "name": "click_button",
            "description": "Click a button at the specified (x, y) position with the given click type.",
            "parameters": {
                "type": "object",
                "properties": {
                    "x": {
                        "type": "integer",
                        "description": "The X coordinate of the button.",
                    },
                    "y": {
                        "type": "integer",
                        "description": "The Y coordinate of the button.",
                    },
                    "click_type": {
                        "type": "string",
                        "enum": ["left", "double_left", "middle", "right"],
                        "description": "The type of mouse click to perform.",
                    },
                },
                "required": ["click_type", "x", "y"],
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "type_text",
            "description": "Type the given text at the current cursor location.",
            "parameters": {
                "type": "object",
                "properties": {
                    "text": {
                        "type": "string",
                        "description": "The text to type.",
                    },
                    "press_enter": {
                        "type": "boolean",
                        "default": False,
                        "description": "Whether to press Enter after typing the text.",
                    },
                },
                "required": ["text", "press_enter"],
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "wait",
            "description": "Wait for a specified amount of time.",
            "parameters": {
                "type": "object",
                "properties": {
                    "duration": {
                        "type": "number",
                        "description": "The duration to wait in seconds.",
                    },
                },
                "required": ["duration"],
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "reprompt",
            "description": (
                "After executing what you asked for, re-perform a screenshot "
                "to determine the next steps. "
                "Best combined with a wait."
            ),
            "parameters": {
                "type": "object",
                "properties": {
                    "nextsteps": {
                        "type": "string",
                        "description": "The next steps to take after the screenshot.",
                    },
                },
                "required": ["nextsteps"],
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "confirm",
            "description": (
                "Confirm that the task is completed and no further actions are needed. "
                "ONLY execute this when no other functions/actions are needed. "
                "This can be the only function called."
            ),
            "parameters": {
                "type": "object",
                "properties": {
                    "goal": {
                        "type": "string",
                        "description": "The goal to achieve/that was achieved.",
                    },
                },
                "required": ["goal"],
            },
        },
    },
]


@dataclass
class Message:
    """A single chat message, optionally carrying an image.

    ``to_dict`` renders the message as a plain dict with ``role`` and
    ``content`` keys (plus ``name`` for tool messages that have one).
    """

    # Who produced the message.
    role: Literal['user', 'assistant', 'system', 'tool']
    # The message text.
    content: str
    # Optional image. A ``str`` is inserted into the data URL unchanged
    # (assumed to already be base64 text); ``bytes`` are treated as raw image
    # data and base64-encoded when serialising.
    image: Optional[Union[str, bytes]] = None
    # When True the image is left out of the serialised message.
    disable_image: bool = False
    # Tool name; only emitted for messages whose role is "tool".
    name: Optional[str] = None

    def _image_url(self) -> str:
        """Return the ``data:`` URL for ``self.image``."""
        payload = self.image
        if isinstance(payload, (bytes, bytearray)):
            # Formatting bytes into an f-string would embed their repr
            # (``b'...'``), so encode them to base64 text first.
            payload = base64.b64encode(payload).decode("ascii")
        return f"data:image/png;base64,{payload}"

    def to_dict(self) -> dict:
        """Serialise the message to a plain dict.

        Returns:
            ``{"role": ..., "content": ...}``. ``content`` is the raw text
            unless an image is present and not disabled, in which case it is
            a two-part list: a ``text`` part followed by an ``image_url``
            part. A ``name`` key is added only when the role is ``"tool"``
            and ``name`` is set.
        """
        # Attach the image only when there is one AND it has not been
        # disabled; otherwise fall back to plain text.
        if self.image and not self.disable_image:
            content = [
                {"type": "text", "text": self.content},
                {"type": "image_url", "image_url": {"url": self._image_url()}},
            ]
        else:
            content = self.content

        base = {"role": self.role, "content": content}
        if self.role == "tool" and self.name:
            base["name"] = self.name
        return base


@dataclass
class Session:
    """An ordered list of messages together with the model to use."""

    # Messages in the order they are serialised.
    messages: list[Message]
    # Model identifier included in ``to_dict`` output.
    model: str = "gpt-4.1"

    def to_dict(self) -> dict:
        """Return ``{"messages": [...], "model": ...}`` for this session."""
        return {
            "messages": self.messages_dict(),
            "model": self.model,
        }

    def messages_dict(self) -> list:
        """Return every message serialised with ``Message.to_dict``."""
        return [message.to_dict() for message in self.messages]