# gpt-agent/objects/aic.py

from dataclasses import dataclass
from typing import Optional, Union
from typing_extensions import Literal

# Model identifiers this module lists as compatible.
# NOTE(review): presumably the models the agent may be run with — confirm against callers.
compatible_models = (
    "gpt-4.1",
    "gpt-4.1-mini",
    "o4-mini",
    "gpt-4.1-nano",
)

# System prompt text for the agent. The value starts with a single newline and
# is otherwise one unbroken line; adjacent literals are concatenated by Python.
SYSTEM_PROMPT = (
    "\n"
    "You are CopeAI Windows Agent. You are currently controlling a Windows 11 machine. "
    "You are capable to see the screen, click buttons, type text, and interact with the system. "
    "You will use the functions provided. The resolution of the machine is 1920x1080. "
    "Your text response must indicate what you are doing. If the place where you clicked seems incorrect, "
    "you will use everything you can to find the position of the location of the goal and click again. "
    "You will see a red cross on where you previously clicked."
)

def _function_tool(name, description, properties, required):
    """Build one tool entry in the shape used throughout ``FUNCTIONS``.

    Args:
        name: Identifier of the function.
        description: Human-readable description of what the function does.
        properties: Mapping of parameter name to its schema dict.
        required: List of parameter names that must be supplied.

    Returns:
        A dict with ``"type": "function"`` and a ``"function"`` sub-dict
        holding the name, description and an object-typed parameter schema.
    """
    return {
        "type": "function",
        "function": {
            "name": name,
            "description": description,
            "parameters": {
                "type": "object",
                "properties": properties,
                "required": required,
            },
        },
    }


# Tool definitions exposed by this module, in a fixed order:
# click_button, type_text, wait, reprompt, confirm, search_pc.
FUNCTIONS = [
    _function_tool(
        "click_button",
        "Click a button at the specified (x, y) position with the given click type.",
        {
            "x": {
                "type": "integer",
                "description": "The X coordinate of the button.",
            },
            "y": {
                "type": "integer",
                "description": "The Y coordinate of the button.",
            },
            "click_type": {
                "type": "string",
                "enum": ["left", "double_left", "middle", "right"],
                "description": "The type of mouse click to perform. `double_left` is a double click.",
            },
        },
        ["click_type", "x", "y"],
    ),
    _function_tool(
        "type_text",
        "Type the given text at the current cursor location.",
        {
            "text": {
                "type": "string",
                "description": "The text to type.",
            },
            "press_enter": {
                "type": "boolean",
                "default": False,
                "description": "Whether to press Enter after typing the text.",
            },
        },
        ["text", "press_enter"],
    ),
    _function_tool(
        "wait",
        "Wait for a specified amount of time.",
        {
            "duration": {
                "type": "number",
                "description": "The duration to wait in seconds.",
            },
        },
        ["duration"],
    ),
    _function_tool(
        "reprompt",
        "After doing what you had to do, re-execute once again with a new screenshot.",
        {
            "nextsteps": {
                "type": "string",
                "description": "The new steps to perform.",
            },
        },
        ["nextsteps"],
    ),
    _function_tool(
        "confirm",
        "Confirm that the task is completed and no further actions are needed. ONLY execute this when you fulfilled the user's request. This can be the only function called.",
        {
            "goal": {
                "type": "string",
                "description": "The goal that was achieved.",
            },
        },
        ["goal"],
    ),
    _function_tool(
        "search_pc",
        "Open the start menu, then searches for content. Use to open apps, open files/folders, or search the web. Use this in priority!!!",
        {
            "query": {
                "type": "string",
                "description": "The search query to perform.",
            },
        },
        ["query"],
    ),
]


@dataclass
class Message:
    """A single chat message, optionally carrying an image.

    ``to_dict`` serialises the message to a ``{"role", "content"}`` dict. When
    an image is attached and not disabled, ``content`` becomes a two-part list
    (text part followed by an ``image_url`` part with a PNG data URL);
    otherwise ``content`` is the plain text.
    """

    # Sender role; "tool" marks a tool-result message.
    role: Literal['user', 'assistant', 'system', 'tool']
    # Text body of the message.
    content: str
    # Optional image. A str is used verbatim as the base64 payload of the data
    # URL; bytes are base64-encoded first.
    # NOTE(review): bytes are assumed to be raw image data, not already
    # base64-encoded — confirm against callers.
    image: Optional[Union[str, bytes]] = None
    # When True, the image is left out of the serialised message.
    disable_image: bool = False
    # Tool name; only emitted for messages whose role is "tool".
    name: Optional[str] = None

    def to_dict(self) -> dict:
        """Return the message as a plain dict.

        Returns:
            A dict with ``"role"`` and ``"content"``; ``"name"`` is added only
            for ``"tool"`` messages that have a non-empty name.
        """
        # Attach the image only when one exists AND it has not been disabled.
        # The previous condition attached it whenever either flag was set,
        # which ignored ``disable_image`` and could emit "base64,None".
        if self.image and not self.disable_image:
            content = [
                {"type": "text", "text": self.content},
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/png;base64,{self._encoded_image()}"
                    },
                },
            ]
        else:
            content = self.content

        payload = {"role": self.role, "content": content}
        if self.role == "tool" and self.name:
            payload["name"] = self.name
        return payload

    def _encoded_image(self) -> str:
        """Return ``image`` as base64 text suitable for a data URL."""
        if isinstance(self.image, (bytes, bytearray)):
            # Formatting bytes directly would embed their repr ("b'...'").
            import base64

            return base64.b64encode(self.image).decode("ascii")
        return str(self.image)

@dataclass
class Session:
    """An ordered list of messages together with the model name to use."""

    # Messages in the order they are serialised.
    messages: list[Message]
    # Model identifier; defaults to "gpt-4.1".
    model: str = "gpt-4.1"

    def to_dict(self) -> dict:
        """Return the session as ``{"messages": [...], "model": ...}``."""
        serialised = []
        for entry in self.messages:
            serialised.append(entry.to_dict())
        return dict(messages=serialised, model=self.model)

    def messages_dict(self) -> list:
        """Return only the serialised messages, one dict per message."""
        serialised = []
        for entry in self.messages:
            serialised.append(entry.to_dict())
        return serialised