Add initial implementation of AI agent with mouse and keyboard control features

2025-05-19 00:48:14 +02:00
parent ed34ebca6a
commit 7e612c1af7
8 changed files with 320 additions and 0 deletions
--- a/ai/compute.py
+++ b/ai/compute.py
@@ -0,0 +1,29 @@
+import pyautogui
+from objects.inputs import MouseInput, KeyboardInput, ButtonType
+
+def press_mouse(mouse_input: MouseInput) -> None:
+    """Presses mouse buttons at the given position."""
+    x, y = mouse_input.x, mouse_input.y
+    button = mouse_input.click_type
+    if button == ButtonType.LEFT:
+        pyautogui.click(x, y, button='left')
+    elif button == ButtonType.DOUBLE_LEFT:
+        pyautogui.doubleClick(x, y)
+    elif button == ButtonType.RIGHT:
+        pyautogui.click(x, y, button='right')
+    elif button == ButtonType.MIDDLE:
+        pyautogui.click(x, y, button='middle')
+
+def press_keyboard(keyboard_input: KeyboardInput) -> None:
+    """Types the given sequence of keys."""
+    text = keyboard_input.text
+    if text:
+        pyautogui.typewrite(text)
+    if keyboard_input.press_enter:
+        pyautogui.press('enter')
+
+def _execute(name, args):
+    if name == "click_button":
+        press_mouse(MouseInput(**args))
+    elif name == "type_text":
+        press_keyboard(KeyboardInput(**args))
--- a/ai/processor.py
+++ b/ai/processor.py
@@ -0,0 +1,73 @@
+import traceback
+import json                                           # new
+import openai
+from objects import aic
+import ai.compute
+
+class AIProcessor:
+    def __init__(self, api_key: str, model: str = "gpt-4.1"):
+        self.oai = openai.Client(api_key=api_key)
+        self.model = model
+        self.session = aic.Session(messages=[aic.Message(role="system", content=aic.SYSTEM_PROMPT)], model=model) # type: ignore
+        self._tools_map = {                           # local binding of python callables
+            "click_button": self._click_button,
+            "type_text": self._type_text,
+        }
+
+    # --------------------- tool implementations --------------------- #
+    def _click_button(self, x: int, y: int, click_type: str) -> str:
+        # TODO: integrate real mouse automation.
+        return f"Performed {click_type} click at ({x}, {y})."
+
+    def _type_text(self, text: str) -> str:
+        # TODO: integrate real typing automation.
+        return f'Typed text: "{text}"'
+
+    def _execute_tool(self, name: str, arguments: dict) -> str:
+        func = self._tools_map.get(name)
+        if not func:
+            return f"Unknown tool: {name}"
+        try:
+            return func(**arguments)
+        except Exception as e:
+            traceback.print_exc()
+            return f"Error executing {name}: {e}"
+
+    # -------------------------- main entry -------------------------- #
+    def process(self, prompt: str, img_data: str | bytes | None = None) -> str | list[dict]:
+        try:
+            self.session.messages.append(
+                aic.Message(role="user", content=prompt, image=img_data)
+            )
+            response = self.oai.chat.completions.create(
+                model=self.model,
+                messages=self.session.messages_dict(),
+                tools=aic.FUNCTIONS,  # type: ignore
+            )
+
+            # return tool call requests if any
+            tool_calls = getattr(response.choices[0].message, "tool_calls", None)
+            if tool_calls:
+                for tc in tool_calls:
+                    ai.compute._execute(
+                        name=tc.function.name,
+                        args=json.loads(tc.function.arguments)
+                    )
+                return [
+                    {
+                        "name": tc.function.name,
+                        "arguments": json.loads(tc.function.arguments),
+                    }
+                    for tc in tool_calls
+                ]
+
+            # otherwise return final assistant content
+            print(f"Response: {json.dumps(response.to_dict(), indent=4)}")  # debug
+            output_text: str = response.choices[0].message.content  # type: ignore
+            self.session.messages.append(
+                aic.Message(role="assistant", content=output_text)
+            )
+            return output_text
+        except Exception as e:
+            traceback.print_exc()
+            return f"Error processing request: {str(e)}"