gpt-agent/objects/aic.py
2025-05-19 17:19:24 +02:00

167 lines
5.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from dataclasses import dataclass
from typing import Optional, Union
from typing_extensions import Literal
# Model names listed as compatible; kept as a tuple so the collection is read-only.
compatible_models = (
    "gpt-4.1",
    "gpt-4.1-mini",
    "o4-mini",
    "gpt-4.1-nano",
)
# System prompt text for the agent. The value starts with a single newline and
# is otherwise one line: adjacent literals are joined with no separator, so
# each fragment carries its own trailing space.
SYSTEM_PROMPT = (
    "\n"
    "You are CopeAI Windows Agent. You are currently controlling a Windows 11 machine. "
    "You are capable to see the screen, click buttons, type text, and interact with the system. "
    "You will use the functions provided. The resolution of the machine is 1920x1080. "
    "Your text response must indicate what you are doing. If the place where you clicked seems incorrect, "
    "you will use everything you can to find the position of the location of the goal and click again. "
    "You will see a red cross on where you previously clicked."
)
def _function_tool(name: str, description: str, properties: dict, required: list) -> dict:
    """Wrap one tool's name, description and parameter schema in the nested
    ``{"type": "function", "function": {...}}`` layout shared by every entry
    of ``FUNCTIONS``."""
    return {
        "type": "function",
        "function": {
            "name": name,
            "description": description,
            "parameters": {
                "type": "object",
                "properties": properties,
                "required": required,
            },
        },
    }


# Tool definitions exposed to the model, one entry per callable action.
# NOTE(review): presumably passed as the `tools` argument of a chat-completion
# request — confirm against the caller.
FUNCTIONS = [
    _function_tool(
        "click_button",
        "Click a button at the specified (x, y) position with the given click type.",
        {
            "x": {
                "type": "integer",
                "description": "The X coordinate of the button.",
            },
            "y": {
                "type": "integer",
                "description": "The Y coordinate of the button.",
            },
            "click_type": {
                "type": "string",
                "enum": ["left", "double_left", "middle", "right"],
                "description": "The type of mouse click to perform. `double_left` is a double click.",
            },
        },
        ["click_type", "x", "y"],
    ),
    _function_tool(
        "type_text",
        "Type the given text at the current cursor location.",
        {
            "text": {
                "type": "string",
                "description": "The text to type.",
            },
            "press_enter": {
                "type": "boolean",
                "default": False,
                "description": "Whether to press Enter after typing the text.",
            },
        },
        ["text", "press_enter"],
    ),
    _function_tool(
        "wait",
        "Wait for a specified amount of time.",
        {
            "duration": {
                "type": "number",
                "description": "The duration to wait in seconds.",
            },
        },
        ["duration"],
    ),
    _function_tool(
        "reprompt",
        "After doing what you had to do, re-execute once again with a new screenshot.",
        {
            "nextsteps": {
                "type": "string",
                "description": "The new steps to perform.",
            },
        },
        ["nextsteps"],
    ),
    _function_tool(
        "confirm",
        "Confirm that the task is completed and no further actions are needed. ONLY execute this when you fulfilled the user's request. This can be the only function called.",
        {
            "goal": {
                "type": "string",
                "description": "The goal that was achieved.",
            },
        },
        ["goal"],
    ),
    _function_tool(
        "search_pc",
        "Open the start menu, then searches for content. Use to open apps, open file explorer, or search the web. Use this in priority!!!",
        {
            "query": {
                "type": "string",
                "description": "The search query to perform.",
            },
        },
        ["query"],
    ),
]
@dataclass
class Message:
    """A single chat message that can be serialized with ``to_dict``.

    The serialized ``content`` is the plain text when no image is attached,
    or a two-part list (text part + ``image_url`` part) when an image is
    attached.
    """

    # Author of the message; "tool" is the only role for which `name` is emitted.
    role: Literal['user', 'assistant', 'system', 'tool']
    # Text body of the message.
    content: str
    # Optional image: a base64 string, or bytes that are base64-encoded on output.
    image: Optional[Union[str, bytes]] = None
    # When True the image is left out of the serialized message even if set.
    disable_image: bool = False
    # Tool name; only serialized when role == "tool" and the name is non-empty.
    name: Optional[str] = None

    def to_dict(self) -> dict:
        """Return the message as a ``{"role": ..., "content": ...}`` dict.

        Returns:
            A dict with ``role`` and ``content``. ``content`` is the text
            itself unless an image is present and ``disable_image`` is false,
            in which case it is a list holding a text part and an
            ``image_url`` part with a ``data:image/png;base64,`` URL. A
            ``name`` key is added for tool messages that carry a name.
        """
        # Attach the image only when there is one AND it has not been disabled;
        # otherwise fall back to plain text so no "base64,None" URL is emitted.
        if self.image and not self.disable_image:
            encoded = self.image
            if isinstance(encoded, (bytes, bytearray)):
                # Interpolating bytes directly would embed their repr (b'...').
                # NOTE(review): assumes bytes are raw image data rather than
                # already-base64 text — confirm against callers.
                import base64

                encoded = base64.b64encode(encoded).decode("ascii")
            content = [
                {"type": "text", "text": self.content},
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/png;base64,{encoded}"},
                },
            ]
        else:
            content = self.content

        base = {"role": self.role, "content": content}
        if self.role == "tool" and self.name:  # include tool name if present
            base["name"] = self.name
        return base
@dataclass
class Session:
    """A conversation: an ordered list of messages plus the model name."""

    # Messages in conversation order; each must provide ``to_dict()``.
    messages: list[Message]
    # Model identifier reported by ``to_dict``.
    model: str = "gpt-4.1"

    def messages_dict(self) -> list:
        """Return each message's ``to_dict()`` result, in list order."""
        return [entry.to_dict() for entry in self.messages]

    def to_dict(self) -> dict:
        """Return ``{"messages": [...], "model": ...}`` for the whole session."""
        serialized = self.messages_dict()
        return {"messages": serialized, "model": self.model}