Add initial implementation of AI agent with mouse and keyboard control features
This commit is contained in:
		
							
								
								
									
										16
									
								
								.vscode/launch.json
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										16
									
								
								.vscode/launch.json
									
									
									
									
										vendored
									
									
										Normal file
									
								
							@@ -0,0 +1,16 @@
 | 
				
			|||||||
 | 
					{
 | 
				
			||||||
 | 
					    // Use IntelliSense to learn about possible attributes.
 | 
				
			||||||
 | 
					    // Hover to view descriptions of existing attributes.
 | 
				
			||||||
 | 
					    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
 | 
				
			||||||
 | 
					    "version": "0.2.0",
 | 
				
			||||||
 | 
					    "configurations": [
 | 
				
			||||||
 | 
					        
 | 
				
			||||||
 | 
					        {
 | 
				
			||||||
 | 
					            "name": "Python Debugger: Current File",
 | 
				
			||||||
 | 
					            "type": "debugpy",
 | 
				
			||||||
 | 
					            "request": "launch",
 | 
				
			||||||
 | 
					            "program": "main.py",
 | 
				
			||||||
 | 
					            "console": "integratedTerminal"
 | 
				
			||||||
 | 
					        }
 | 
				
			||||||
 | 
					    ]
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
							
								
								
									
										29
									
								
								ai/compute.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										29
									
								
								ai/compute.py
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,29 @@
 | 
				
			|||||||
 | 
					import pyautogui
 | 
				
			||||||
 | 
					from objects.inputs import MouseInput, KeyboardInput, ButtonType
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def press_mouse(mouse_input: MouseInput) -> None:
					    """Click the mouse at the position described by ``mouse_input``.

					    ``mouse_input.click_type`` may be a ``ButtonType`` member or the
					    member's string value (for example ``"left"``). The string form is
					    what arrives when the input is built directly from JSON tool-call
					    arguments, as ``_execute`` does.

					    Raises:
					        ValueError: if ``click_type`` is not a valid ``ButtonType`` value.
					    """
					    raw_type = mouse_input.click_type
					    # A plain Enum member never compares equal to its raw value, so a
					    # string click type has to be converted before it can be matched.
					    if isinstance(raw_type, ButtonType):
					        button = raw_type
					    else:
					        button = ButtonType(raw_type)

					    x, y = mouse_input.x, mouse_input.y
					    if button is ButtonType.DOUBLE_LEFT:
					        pyautogui.doubleClick(x, y)
					        return

					    # Remaining click types are single clicks that differ only by button.
					    single_click_buttons = {
					        ButtonType.LEFT: 'left',
					        ButtonType.RIGHT: 'right',
					        ButtonType.MIDDLE: 'middle',
					    }
					    pyautogui.click(x, y, button=single_click_buttons[button])
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def press_keyboard(keyboard_input: KeyboardInput) -> None:
					    """Type ``keyboard_input.text`` and optionally press Enter afterwards.

					    Empty text is skipped; Enter is pressed only when
					    ``keyboard_input.press_enter`` is truthy.
					    """
					    if keyboard_input.text:
					        pyautogui.typewrite(keyboard_input.text)

					    wants_enter = keyboard_input.press_enter
					    if wants_enter:
					        pyautogui.press('enter')
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def _execute(name, args):
 | 
				
			||||||
 | 
					    if name == "click_button":
 | 
				
			||||||
 | 
					        press_mouse(MouseInput(**args))
 | 
				
			||||||
 | 
					    elif name == "type_text":
 | 
				
			||||||
 | 
					        press_keyboard(KeyboardInput(**args))
 | 
				
			||||||
							
								
								
									
										73
									
								
								ai/processor.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										73
									
								
								ai/processor.py
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,73 @@
 | 
				
			|||||||
 | 
					import traceback
 | 
				
			||||||
 | 
					import json                                           # new
 | 
				
			||||||
 | 
					import openai
 | 
				
			||||||
 | 
					from objects import aic
 | 
				
			||||||
 | 
					import ai.compute
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					class AIProcessor:
 | 
				
			||||||
 | 
					    def __init__(self, api_key: str, model: str = "gpt-4.1"):
 | 
				
			||||||
 | 
					        self.oai = openai.Client(api_key=api_key)
 | 
				
			||||||
 | 
					        self.model = model
 | 
				
			||||||
 | 
					        self.session = aic.Session(messages=[aic.Message(role="system", content=aic.SYSTEM_PROMPT)], model=model) # type: ignore
 | 
				
			||||||
 | 
					        self._tools_map = {                           # local binding of python callables
 | 
				
			||||||
 | 
					            "click_button": self._click_button,
 | 
				
			||||||
 | 
					            "type_text": self._type_text,
 | 
				
			||||||
 | 
					        }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    # --------------------- tool implementations --------------------- #
 | 
				
			||||||
 | 
					    def _click_button(self, x: int, y: int, click_type: str) -> str:
 | 
				
			||||||
 | 
					        # TODO: integrate real mouse automation.
 | 
				
			||||||
 | 
					        return f"Performed {click_type} click at ({x}, {y})."
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def _type_text(self, text: str) -> str:
 | 
				
			||||||
 | 
					        # TODO: integrate real typing automation.
 | 
				
			||||||
 | 
					        return f'Typed text: "{text}"'
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def _execute_tool(self, name: str, arguments: dict) -> str:
 | 
				
			||||||
 | 
					        func = self._tools_map.get(name)
 | 
				
			||||||
 | 
					        if not func:
 | 
				
			||||||
 | 
					            return f"Unknown tool: {name}"
 | 
				
			||||||
 | 
					        try:
 | 
				
			||||||
 | 
					            return func(**arguments)
 | 
				
			||||||
 | 
					        except Exception as e:
 | 
				
			||||||
 | 
					            traceback.print_exc()
 | 
				
			||||||
 | 
					            return f"Error executing {name}: {e}"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    # -------------------------- main entry -------------------------- #
 | 
				
			||||||
 | 
					    def process(self, prompt: str, img_data: str | bytes | None = None) -> str | list[dict]:
 | 
				
			||||||
 | 
					        try:
 | 
				
			||||||
 | 
					            self.session.messages.append(
 | 
				
			||||||
 | 
					                aic.Message(role="user", content=prompt, image=img_data)
 | 
				
			||||||
 | 
					            )
 | 
				
			||||||
 | 
					            response = self.oai.chat.completions.create(
 | 
				
			||||||
 | 
					                model=self.model,
 | 
				
			||||||
 | 
					                messages=self.session.messages_dict(),
 | 
				
			||||||
 | 
					                tools=aic.FUNCTIONS,  # type: ignore
 | 
				
			||||||
 | 
					            )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					            # return tool call requests if any
 | 
				
			||||||
 | 
					            tool_calls = getattr(response.choices[0].message, "tool_calls", None)
 | 
				
			||||||
 | 
					            if tool_calls:
 | 
				
			||||||
 | 
					                for tc in tool_calls:
 | 
				
			||||||
 | 
					                    ai.compute._execute(
 | 
				
			||||||
 | 
					                        name=tc.function.name,
 | 
				
			||||||
 | 
					                        args=json.loads(tc.function.arguments)
 | 
				
			||||||
 | 
					                    )
 | 
				
			||||||
 | 
					                return [
 | 
				
			||||||
 | 
					                    {
 | 
				
			||||||
 | 
					                        "name": tc.function.name,
 | 
				
			||||||
 | 
					                        "arguments": json.loads(tc.function.arguments),
 | 
				
			||||||
 | 
					                    }
 | 
				
			||||||
 | 
					                    for tc in tool_calls
 | 
				
			||||||
 | 
					                ]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					            # otherwise return final assistant content
 | 
				
			||||||
 | 
					            print(f"Response: {json.dumps(response.to_dict(), indent=4)}")  # debug
 | 
				
			||||||
 | 
					            output_text: str = response.choices[0].message.content  # type: ignore
 | 
				
			||||||
 | 
					            self.session.messages.append(
 | 
				
			||||||
 | 
					                aic.Message(role="assistant", content=output_text)
 | 
				
			||||||
 | 
					            )
 | 
				
			||||||
 | 
					            return output_text
 | 
				
			||||||
 | 
					        except Exception as e:
 | 
				
			||||||
 | 
					            traceback.print_exc()
 | 
				
			||||||
 | 
					            return f"Error processing request: {str(e)}"
 | 
				
			||||||
							
								
								
									
										18
									
								
								main.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										18
									
								
								main.py
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,18 @@
 | 
				
			|||||||
 | 
					from dotenv import load_dotenv
 | 
				
			||||||
 | 
					import os
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					import ai.processor
 | 
				
			||||||
 | 
					import webserver.web
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					load_dotenv()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def main():
					    """Build the AI processor from environment settings and run the web server."""
					    api_key = os.getenv("OPENAI_API_KEY", "")
					    model_name = os.getenv("OPENAI_MODEL", "gpt-4.1")
					    processor = ai.processor.AIProcessor(api_key=api_key, model=model_name)
					    webserver.web.WebServerApp(processor).run()


					if __name__ == "__main__":
					    main()
 | 
				
			||||||
							
								
								
									
										97
									
								
								objects/aic.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										97
									
								
								objects/aic.py
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,97 @@
 | 
				
			|||||||
 | 
					from dataclasses import dataclass
 | 
				
			||||||
 | 
					from typing import Optional, Union
 | 
				
			||||||
 | 
					from typing_extensions import Literal
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# Model names listed as compatible (not referenced by the code visible here).
					compatible_models = (
					    "gpt-4.1",
					    "gpt-4.1-mini",
					    "o4-mini",
					    "gpt-4.1-nano",
					)

					# System prompt: a leading newline followed by one unbroken line of text.
					SYSTEM_PROMPT = (
					    "\n"
					    "You are CopeAI Windows Agent. You are currently controlling a Windows 11 machine. "
					    "You are capable to see the screen, click buttons, type text, and interact with the system. "
					    "You will use the functions provided. The resolution of the machine is 1920x1080. "
					    "Your text response must indicate what you are doing."
					)

					# Tool schema for a mouse click at a screen position.
					_CLICK_BUTTON_TOOL = {
					    "type": "function",
					    "function": {
					        "name": "click_button",
					        "description": "Click a button at the specified (x, y) position with the given click type.",
					        "parameters": {
					            "type": "object",
					            "properties": {
					                "x": {
					                    "type": "integer",
					                    "description": "The X coordinate of the button.",
					                },
					                "y": {
					                    "type": "integer",
					                    "description": "The Y coordinate of the button.",
					                },
					                "click_type": {
					                    "type": "string",
					                    "enum": ["left", "double_left", "middle", "right"],
					                    "description": "The type of mouse click to perform.",
					                },
					            },
					            "required": ["click_type", "x", "y"],
					        },
					    },
					}

					# Tool schema for typing text, optionally followed by Enter.
					_TYPE_TEXT_TOOL = {
					    "type": "function",
					    "function": {
					        "name": "type_text",
					        "description": "Type the given text at the current cursor location.",
					        "parameters": {
					            "type": "object",
					            "properties": {
					                "text": {
					                    "type": "string",
					                    "description": "The text to type.",
					                },
					                "press_enter": {
					                    "type": "boolean",
					                    "default": False,
					                    "description": "Whether to press Enter after typing the text.",
					                },
					            },
					            "required": ["text", "press_enter"],
					        },
					    },
					}

					# Tool definitions passed as ``tools`` to the chat completion request.
					FUNCTIONS = [_CLICK_BUTTON_TOOL, _TYPE_TEXT_TOOL]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					@dataclass
					class Message:
					    """One chat message, optionally carrying a PNG image and a tool name."""

					    role: Literal['user', 'assistant', 'system', 'tool']
					    content: str
					    # Image payload; a str is inserted into the data URL as-is.
					    image: Optional[Union[str, bytes]] = None
					    # When true, the image is left out of the serialized message.
					    disable_image: bool = False
					    # Only emitted for messages whose role is "tool".
					    name: Optional[str] = None

					    def to_dict(self) -> dict:
					        """Serialize the message into the dict form sent to the chat API.

					        ``content`` is the plain text unless an image is present and not
					        disabled, in which case it becomes a text part plus an
					        ``image_url`` part holding a base64 PNG data URL.
					        """
					        payload: dict = {"role": self.role, "content": self.content}
					        # Attach the image only when there is one and it is not disabled;
					        # otherwise a missing image would be sent as "...base64,None".
					        if self.image and not self.disable_image:
					            payload["content"] = [
					                {"type": "text", "text": self.content},
					                {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{self._image_base64()}"}},
					            ]
					        if self.role == "tool" and self.name:
					            payload["name"] = self.name
					        return payload

					    def _image_base64(self) -> str:
					        """Return the image as base64 text suitable for a data URL."""
					        if isinstance(self.image, bytes):
					            import base64

					            # NOTE(review): assumes bytes hold raw image data rather than
					            # already-encoded base64 text — confirm against callers.
					            return base64.b64encode(self.image).decode("ascii")
					        return str(self.image)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					@dataclass
 | 
				
			||||||
 | 
					class Session:
 | 
				
			||||||
 | 
					    messages: list[Message]
 | 
				
			||||||
 | 
					    model: str = "gpt-4.1"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def to_dict(self) -> dict:
 | 
				
			||||||
 | 
					        return {
 | 
				
			||||||
 | 
					            "messages": [message.to_dict() for message in self.messages],
 | 
				
			||||||
 | 
					            "model": self.model
 | 
				
			||||||
 | 
					        }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def messages_dict(self) -> list:
 | 
				
			||||||
 | 
					        return [message.to_dict() for message in self.messages]
 | 
				
			||||||
							
								
								
									
										20
									
								
								objects/inputs.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										20
									
								
								objects/inputs.py
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,20 @@
 | 
				
			|||||||
 | 
					from dataclasses import dataclass
 | 
				
			||||||
 | 
					from enum import Enum
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					class ButtonType(Enum):
					    """Kinds of mouse click; values match the ``click_type`` tool strings."""

					    LEFT = "left"
					    DOUBLE_LEFT = "double_left"
					    RIGHT = "right"
					    MIDDLE = "middle"


					@dataclass
					class KeyboardInput:
					    """Text to type, optionally followed by the Enter key."""

					    text: str
					    press_enter: bool = False


					@dataclass
					class MouseInput:
					    """A single mouse click at screen position ``(x, y)``.

					    ``click_type`` accepts a ``ButtonType`` member or its string value;
					    strings are converted on construction so instances built straight
					    from JSON arguments compare correctly against ``ButtonType`` members.

					    Raises:
					        ValueError: if ``click_type`` is not a valid ``ButtonType`` value.
					    """

					    x: int
					    y: int
					    click_type: ButtonType

					    def __post_init__(self) -> None:
					        # A plain Enum member never equals its raw value, so normalise here.
					        if not isinstance(self.click_type, ButtonType):
					            self.click_type = ButtonType(self.click_type)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
							
								
								
									
										7
									
								
								requirements.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										7
									
								
								requirements.txt
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,7 @@
 | 
				
			|||||||
 | 
					openai
 | 
				
			||||||
 | 
					requests
 | 
				
			||||||
 | 
					flask
 | 
				
			||||||
 | 
					python-dotenv
 | 
				
			||||||
 | 
					# libraries to control mouse+keyboard+see screen
 | 
				
			||||||
 | 
					pyautogui
 | 
				
			||||||
 | 
					pynput
 | 
				
			||||||
							
								
								
									
										60
									
								
								webserver/web.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										60
									
								
								webserver/web.py
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,60 @@
 | 
				
			|||||||
 | 
					from flask import Flask, request, jsonify
 | 
				
			||||||
 | 
					import os, ai.processor
 | 
				
			||||||
 | 
					from dotenv import load_dotenv
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					load_dotenv()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					class WebServerApp:
					    """Flask application that exposes the AI processor over HTTP.

					    Routes:
					        GET  /api/test     -- fixed greeting.
					        POST /api/request  -- form-data with ``prompt`` and optional ``img``
					                              file; forwards both to the processor.
					        GET  /api/health   -- fixed health status.
					    """

					    def __init__(self, aip):
					        """Create the Flask app around the processor ``aip``."""
					        self.app = Flask(__name__)
					        # Bind the processor before registering routes so no handler can
					        # exist on a half-initialised instance.
					        self.aip: ai.processor.AIProcessor = aip
					        self._register_routes()

					    def _register_routes(self):
					        """Attach all route handlers to ``self.app``."""

					        @self.app.route('/api/test')
					        def test():
					            return jsonify({"message": "Hello, World!"})

					        @self.app.route('/api/request', methods=['POST'])
					        def handle_request():
					            import base64

					            # sent as form-data
					            data = request.form.to_dict()
					            if not data:
					                return jsonify({"error": "No data provided"}), 400

					            prompt = data.get('prompt', '')
					            if not prompt:
					                return jsonify({"error": "No prompt provided"}), 400

					            # An uploaded image is forwarded as base64 text, the form the
					            # processor's messages place into a data URL.
					            img_data = None
					            img_file = request.files.get('img')
					            if img_file:
					                raw_image = img_file.read()
					                if raw_image:
					                    img_data = base64.b64encode(raw_image).decode('utf-8')

					            response = self.aip.process(prompt, img_data)
					            return jsonify({"response": response}), 200

					        @self.app.route('/api/health')
					        def health():
					            return jsonify({"status": "healthy"})

					    def run(self, *args, **kwargs):
					        """Start the Flask server; arguments are passed to ``Flask.run``."""
					        self.app.run(*args, **kwargs)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# Example usage:
 | 
				
			||||||
 | 
					# if __name__ == "__main__":
 | 
				
			||||||
 | 
					#     server = WebServerApp(aip)  # aip: an ai.processor.AIProcessor instance
 | 
				
			||||||
 | 
					#     server.run()
 | 
				
			||||||
		Reference in New Issue
	
	Block a user