diff --git a/.vscode/launch.json b/.vscode/launch.json
new file mode 100644
index 0000000..15f5e60
--- /dev/null
+++ b/.vscode/launch.json
@@ -0,0 +1,16 @@
+{
+    // Use IntelliSense to learn about possible attributes.
+    // Hover to view descriptions of existing attributes.
+    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
+    "version": "0.2.0",
+    "configurations": [
+
+        {
+            "name": "Python Debugger: Current File",
+            "type": "debugpy",
+            "request": "launch",
+            "program": "main.py",
+            "console": "integratedTerminal"
+        }
+    ]
+}
diff --git a/ai/compute.py b/ai/compute.py
new file mode 100644
index 0000000..46033aa
--- /dev/null
+++ b/ai/compute.py
@@ -0,0 +1,32 @@
+import pyautogui
+from objects.inputs import MouseInput, KeyboardInput, ButtonType
+
+def press_mouse(mouse_input: MouseInput) -> None:
+    """Presses mouse buttons at the given position."""
+    x, y = mouse_input.x, mouse_input.y
+    button = mouse_input.click_type
+    if button == ButtonType.LEFT:
+        pyautogui.click(x, y, button='left')
+    elif button == ButtonType.DOUBLE_LEFT:
+        pyautogui.doubleClick(x, y)
+    elif button == ButtonType.RIGHT:
+        pyautogui.click(x, y, button='right')
+    elif button == ButtonType.MIDDLE:
+        pyautogui.click(x, y, button='middle')
+
+def press_keyboard(keyboard_input: KeyboardInput) -> None:
+    """Types the given sequence of keys."""
+    text = keyboard_input.text
+    if text:
+        pyautogui.typewrite(text)
+    if keyboard_input.press_enter:
+        pyautogui.press('enter')
+
+def _execute(name, args):
+    if name == "click_button":
+        # The model sends click_type as a plain string; coerce it to the
+        # enum so the comparisons in press_mouse match.
+        args["click_type"] = ButtonType(args["click_type"])
+        press_mouse(MouseInput(**args))
+    elif name == "type_text":
+        press_keyboard(KeyboardInput(**args))
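
The dispatcher above receives its arguments in exactly the shape json.loads() produces from a model tool call, so it can be exercised in isolation. A minimal sketch (the coordinates and text are made up for illustration):

    import ai.compute

    # Payloads mirror what the model emits for each tool.
    ai.compute._execute("click_button", {"x": 960, "y": 540, "click_type": "double_left"})
    ai.compute._execute("type_text", {"text": "hello from the agent", "press_enter": True})
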
diff --git a/ai/processor.py b/ai/processor.py
new file mode 100644
index 0000000..e90d548
--- /dev/null
+++ b/ai/processor.py
@@ -0,0 +1,74 @@
+import traceback
+import json
+import openai
+from objects import aic
+import ai.compute
+
+class AIProcessor:
+    def __init__(self, api_key: str, model: str = "gpt-4.1"):
+        self.oai = openai.Client(api_key=api_key)
+        self.model = model
+        self.session = aic.Session(messages=[aic.Message(role="system", content=aic.SYSTEM_PROMPT)], model=model)  # type: ignore
+        self._tools_map = {  # local bindings of Python callables
+            "click_button": self._click_button,
+            "type_text": self._type_text,
+        }
+
+    # --------------------- tool implementations --------------------- #
+    def _click_button(self, x: int, y: int, click_type: str) -> str:
+        # TODO: integrate real mouse automation.
+        return f"Performed {click_type} click at ({x}, {y})."
+
+    def _type_text(self, text: str) -> str:
+        # TODO: integrate real typing automation.
+        return f'Typed text: "{text}"'
+
+    def _execute_tool(self, name: str, arguments: dict) -> str:
+        func = self._tools_map.get(name)
+        if not func:
+            return f"Unknown tool: {name}"
+        try:
+            return func(**arguments)
+        except Exception as e:
+            traceback.print_exc()
+            return f"Error executing {name}: {e}"
+
+    # -------------------------- main entry -------------------------- #
+    def process(self, prompt: str, img_data: str | bytes | None = None) -> str | list[dict]:
+        try:
+            self.session.messages.append(
+                aic.Message(role="user", content=prompt, image=img_data)
+            )
+            response = self.oai.chat.completions.create(
+                model=self.model,
+                messages=self.session.messages_dict(),
+                tools=aic.FUNCTIONS,  # type: ignore
+            )
+
+            # Execute any requested tool calls and return them to the caller.
+            # NOTE: executed calls are not yet recorded in the session history.
+            tool_calls = getattr(response.choices[0].message, "tool_calls", None)
+            if tool_calls:
+                for tc in tool_calls:
+                    ai.compute._execute(
+                        name=tc.function.name,
+                        args=json.loads(tc.function.arguments)
+                    )
+                return [
+                    {
+                        "name": tc.function.name,
+                        "arguments": json.loads(tc.function.arguments),
+                    }
+                    for tc in tool_calls
+                ]
+
+            # Otherwise return the final assistant content.
+            print(f"Response: {json.dumps(response.to_dict(), indent=4)}")  # debug
+            output_text: str = response.choices[0].message.content  # type: ignore
+            self.session.messages.append(
+                aic.Message(role="assistant", content=output_text)
+            )
+            return output_text
+        except Exception as e:
+            traceback.print_exc()
+            return f"Error processing request: {str(e)}"
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..6626e80
--- /dev/null
+++ b/main.py
@@ -0,0 +1,18 @@
+from dotenv import load_dotenv
+import os
+
+import ai.processor
+import webserver.web
+
+load_dotenv()
+
+def main():
+    aip = ai.processor.AIProcessor(
+        api_key=os.getenv("OPENAI_API_KEY", ""),
+        model=os.getenv("OPENAI_MODEL", "gpt-4.1")
+    )
+    server = webserver.web.WebServerApp(aip)
+    server.run()
+
+if __name__ == "__main__":
+    main()
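
AIProcessor.process() returns either the assistant's text or, when the model calls tools, the list of already-executed calls, so a caller can branch on the return type. A minimal sketch, assuming OPENAI_API_KEY is set in the environment (the prompt is illustrative):

    import os
    import ai.processor

    aip = ai.processor.AIProcessor(api_key=os.environ["OPENAI_API_KEY"])
    result = aip.process("Open Notepad and type hello.")
    if isinstance(result, list):
        print("Executed tool calls:", result)  # [{"name": ..., "arguments": {...}}, ...]
    else:
        print("Assistant replied:", result)
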
diff --git a/objects/aic.py b/objects/aic.py
new file mode 100644
index 0000000..eddd02c
--- /dev/null
+++ b/objects/aic.py
@@ -0,0 +1,97 @@
+from dataclasses import dataclass
+from typing import Optional, Union
+from typing_extensions import Literal
+
+compatible_models = ("gpt-4.1", "gpt-4.1-mini", "o4-mini", "gpt-4.1-nano")
+
+SYSTEM_PROMPT = """
+You are CopeAI Windows Agent. You are currently controlling a Windows 11 machine. \
+You are able to see the screen, click buttons, type text, and interact with the system. \
+You will use the functions provided. The resolution of the machine is 1920x1080. \
+Your text response must indicate what you are doing."""
+
+FUNCTIONS = [
+    {
+        "type": "function",
+        "function": {
+            "name": "click_button",
+            "description": "Click a button at the specified (x, y) position with the given click type.",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "x": {
+                        "type": "integer",
+                        "description": "The X coordinate of the button."
+                    },
+                    "y": {
+                        "type": "integer",
+                        "description": "The Y coordinate of the button."
+                    },
+                    "click_type": {
+                        "type": "string",
+                        "enum": ["left", "double_left", "middle", "right"],
+                        "description": "The type of mouse click to perform."
+                    }
+                },
+                "required": ["click_type", "x", "y"],
+            }
+        }
+    },
+    {
+        "type": "function",
+        "function": {
+            "name": "type_text",
+            "description": "Type the given text at the current cursor location.",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "text": {
+                        "type": "string",
+                        "description": "The text to type."
+                    },
+                    "press_enter": {
+                        "type": "boolean",
+                        "default": False,
+                        "description": "Whether to press Enter after typing the text."
+                    }
+                },
+                "required": ["text", "press_enter"],
+            }
+        }
+    }
]
+
+
+@dataclass
+class Message:
+    role: Literal['user', 'assistant', 'system', 'tool']
+    content: str
+    image: Optional[Union[str, bytes]] = None
+    disable_image: bool = False
+    name: Optional[str] = None  # only set for tool messages
+
+    def to_dict(self) -> dict:
+        base = {
+            "role": self.role,
+            "content": self.content
+            if (not self.image or self.disable_image)
+            else [{"type": "text", "text": self.content},
+                  {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{self.image}"}}],
+        }
+        if self.role == "tool" and self.name:  # include tool name if present
+            base["name"] = self.name
+        return base
+
+@dataclass
+class Session:
+    messages: list[Message]
+    model: str = "gpt-4.1"
+
+    def to_dict(self) -> dict:
+        return {
+            "messages": [message.to_dict() for message in self.messages],
+            "model": self.model
+        }
+
+    def messages_dict(self) -> list:
+        return [message.to_dict() for message in self.messages]
diff --git a/objects/inputs.py b/objects/inputs.py
new file mode 100644
index 0000000..1d090f3
--- /dev/null
+++ b/objects/inputs.py
@@ -0,0 +1,21 @@
+from dataclasses import dataclass
+from enum import Enum
+
+class ButtonType(Enum):
+    LEFT = "left"
+    DOUBLE_LEFT = "double_left"
+    RIGHT = "right"
+    MIDDLE = "middle"
+
+@dataclass
+class KeyboardInput:
+    text: str
+    press_enter: bool = False
+
+@dataclass
+class MouseInput:
+    x: int
+    y: int
+    # A single click type per input; ai.compute coerces the model's string.
+    click_type: ButtonType
+
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..df725ad
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,7 @@
+openai
+requests
+flask
+python-dotenv
+# libraries to control mouse+keyboard+see screen
+pyautogui
+pynput
diff --git a/webserver/web.py b/webserver/web.py
new file mode 100644
index 0000000..865ab45
--- /dev/null
+++ b/webserver/web.py
@@ -0,0 +1,50 @@
+import base64
+import ai.processor
+from flask import Flask, request, jsonify
+from dotenv import load_dotenv
+
+load_dotenv()
+
+class WebServerApp:
+    def __init__(self, aip):
+        self.app = Flask(__name__)
+        self._register_routes()
+        self.aip: ai.processor.AIProcessor = aip
+
+    def _register_routes(self):
+        @self.app.route('/api/test')
+        def test():
+            return jsonify({"message": "Hello, World!"})
+
+        @self.app.route('/api/request', methods=['POST'])
+        def handle_request():
+            # Fields arrive as form-data.
+            data = request.form.to_dict()
+            if not data:
+                return jsonify({"error": "No data provided"}), 400
+
+            prompt = data.get('prompt', '')
+            if not prompt:
+                return jsonify({"error": "No prompt provided"}), 400
+
+            # Base64-encode the optional screenshot for the model.
+            img_data = None
+            if 'img' in request.files:
+                img_file = request.files['img']
+                if img_file:
+                    img_data = base64.b64encode(img_file.read()).decode('utf-8')
+
+            response = self.aip.process(prompt, img_data)
+            return jsonify({"response": response}), 200
+
+        @self.app.route('/api/health')
+        def health():
+            return jsonify({"status": "healthy"})
+
+    def run(self, *args, **kwargs):
+        self.app.run(*args, **kwargs)
+
+# Example usage:
+# if __name__ == "__main__":
+#     server = WebServerApp(aip)
+#     server.run()
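
For manual testing, /api/request accepts a prompt form field plus an optional img file upload. A sketch using the requests dependency pinned above, against Flask's default host and port (the screenshot path is illustrative):

    import requests

    with open("screenshot.png", "rb") as f:
        resp = requests.post(
            "http://127.0.0.1:5000/api/request",
            data={"prompt": "Click the Start button."},
            files={"img": f},
        )
    print(resp.json())
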