167 lines
5.8 KiB
Python
167 lines
5.8 KiB
Python
from dataclasses import dataclass
|
||
from typing import Optional, Union
|
||
from typing_extensions import Literal
|
||
|
||
# Model identifiers listed as compatible; how the tuple is consumed is not
# visible in this part of the file.
compatible_models = (
    "gpt-4.1",
    "gpt-4.1-mini",
    "o4-mini",
    "gpt-4.1-nano",
)
|
||
|
||
# System prompt sent to the model. The text begins with a single newline and
# is otherwise one continuous line (the pieces below are joined without any
# separator, each one carrying its own trailing space).
SYSTEM_PROMPT = (
    "\n"
    "You are CopeAI Windows Agent. You are currently controlling a Windows 11 machine. "
    "You are capable to see the screen, click buttons, type text, and interact with the system. "
    "You will use the functions provided. The resolution of the machine is 1920x1080. "
    "Your text response must indicate what you are doing. If the place where you clicked seems incorrect, "
    "you will use everything you can to find the position of the location of the goal and click again. "
    "You will see a red cross on where you previously clicked."
)
|
||
|
||
def _function_tool(name, description, properties, required):
    """Wrap one tool definition in the ``{"type": "function", ...}`` envelope.

    ``properties`` maps argument names to their JSON-schema fragments and
    ``required`` lists the argument names that must be supplied.
    """
    return {
        "type": "function",
        "function": {
            "name": name,
            "description": description,
            "parameters": {
                "type": "object",
                "properties": properties,
                "required": required,
            },
        },
    }


# Tool schemas, one entry per action: click, type, wait, re-prompt with a new
# screenshot, confirm completion, and start-menu search.
FUNCTIONS = [
    _function_tool(
        "click_button",
        "Click a button at the specified (x, y) position with the given click type.",
        {
            "x": {
                "type": "integer",
                "description": "The X coordinate of the button.",
            },
            "y": {
                "type": "integer",
                "description": "The Y coordinate of the button.",
            },
            "click_type": {
                "type": "string",
                "enum": ["left", "double_left", "middle", "right"],
                "description": "The type of mouse click to perform. `double_left` is a double click.",
            },
        },
        ["click_type", "x", "y"],
    ),
    _function_tool(
        "type_text",
        "Type the given text at the current cursor location.",
        {
            "text": {
                "type": "string",
                "description": "The text to type.",
            },
            "press_enter": {
                "type": "boolean",
                "default": False,
                "description": "Whether to press Enter after typing the text.",
            },
        },
        ["text", "press_enter"],
    ),
    _function_tool(
        "wait",
        "Wait for a specified amount of time.",
        {
            "duration": {
                "type": "number",
                "description": "The duration to wait in seconds.",
            },
        },
        ["duration"],
    ),
    _function_tool(
        "reprompt",
        "After doing what you had to do, re-execute once again with a new screenshot.",
        {
            "nextsteps": {
                "type": "string",
                "description": "The new steps to perform.",
            },
        },
        ["nextsteps"],
    ),
    _function_tool(
        "confirm",
        "Confirm that the task is completed and no further actions are needed. ONLY execute this when you fulfilled the user's request. This can be the only function called.",
        {
            "goal": {
                "type": "string",
                "description": "The goal that was achieved.",
            },
        },
        ["goal"],
    ),
    _function_tool(
        "search_pc",
        "Open the start menu, then searches for content. Use to open apps, open files/folders, or search the web. Use this in priority!!!",
        {
            "query": {
                "type": "string",
                "description": "The search query to perform.",
            },
        },
        ["query"],
    ),
]
|
||
|
||
|
||
@dataclass
class Message:
    """One chat message, optionally accompanied by an image.

    Attributes are documented inline; ``to_dict`` produces the plain-dict
    form of the message.
    """

    # Speaker of the message; "tool" marks a tool/function result.
    role: Literal['user', 'assistant', 'system', 'tool']
    # Text of the message.
    content: str
    # Image payload: a base64 string, or bytes (encoded in ``to_dict``).
    image: Optional[Union[str, bytes]] = None
    # When True the image is left out of the serialised message.
    disable_image: bool = False
    # Tool name; only emitted when ``role`` is "tool".
    name: Optional[str] = None

    def to_dict(self) -> dict:
        """Return the message as a plain dict with ``role`` and ``content``.

        ``content`` is the bare text when there is no image or when
        ``disable_image`` is set. Otherwise it is a two-element list: a
        ``text`` part followed by an ``image_url`` part whose URL is a
        ``data:image/png;base64,...`` URL built from ``image``.

        A ``name`` key is added only for tool messages that have a name.
        """
        # Local import keeps this block self-contained without touching the
        # file-level import section.
        import base64

        # The image is attached only when one exists AND it has not been
        # disabled. (The previous condition attached an image part whenever
        # ``disable_image`` was True, even producing a "...base64,None" URL.)
        if not self.image or self.disable_image:
            content = self.content
        else:
            image = self.image
            if isinstance(image, (bytes, bytearray)):
                # Interpolating bytes directly would embed their repr
                # ("b'...'") in the URL, so encode them to base64 text.
                # NOTE(review): assumes bytes are raw image data rather than
                # already-base64 text — confirm against callers.
                image = base64.b64encode(image).decode("ascii")
            content = [
                {"type": "text", "text": self.content},
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/png;base64,{image}"},
                },
            ]

        base = {"role": self.role, "content": content}
        if self.role == "tool" and self.name:  # include tool name if present
            base["name"] = self.name
        return base
|
||
|
||
@dataclass
class Session:
    """A list of messages together with the model name to use for them."""

    messages: list[Message]
    model: str = "gpt-4.1"

    def to_dict(self) -> dict:
        """Return ``{"messages": [...], "model": ...}`` with messages serialised."""
        payload = {"messages": self.messages_dict()}
        payload["model"] = self.model
        return payload

    def messages_dict(self) -> list:
        """Return every message converted with its own ``to_dict``, in order."""
        return [entry.to_dict() for entry in self.messages]
|