# gpt-agent/ai/compute.py

import pyautogui
import threading
import time, io, base64
import sys
# Optional GUI backends for the click-indicator overlay. Each backend is
# probed independently and its *_AVAILABLE flag records the outcome, so every
# flag is always defined regardless of platform or installed packages.

# Try PyQt5 for transparent, click-through overlay
try:
    from PyQt5 import QtWidgets, QtCore, QtGui
    PYQT_AVAILABLE = True
except ImportError:
    # PyQt5 is optional; the other backends below may still be usable.
    PYQT_AVAILABLE = False

# tkinter is the fallback overlay backend. It ships with CPython but is
# absent from some minimal installs, so it is treated as optional too.
try:
    import tkinter as tk
    TK_AVAILABLE = True
except ImportError:
    tk = None
    TK_AVAILABLE = False

import sys

# The Cocoa (AppKit) backend only exists on macOS.
if sys.platform == 'darwin':
    try:
        from AppKit import (
            NSApplication,
            NSPanel,
            NSColor,
            NSBezierPath,
            NSView,
            NSWindowStyleMaskBorderless,
            NSBackingStoreBuffered,
            NSStatusWindowLevel,
            NSWindowCollectionBehaviorCanJoinAllSpaces,
        )
        COCOA_AVAILABLE = True
    except ImportError:
        COCOA_AVAILABLE = False
else:
    COCOA_AVAILABLE = False
from objects.inputs import MouseInput, KeyboardInput, ButtonType
from PIL import ImageGrab  # type: ignore

def take_screenshot() -> bytes:
    """Grab the screen with ``ImageGrab.grab()`` and return it as PNG bytes."""
    with io.BytesIO() as png_buffer:
        # Encode in memory; nothing is written to disk.
        ImageGrab.grab().save(png_buffer, format='PNG')
        return png_buffer.getvalue()

def screenshot_to_base64(screenshot: bytes) -> str:
    """Return the base64 text representation of the given image bytes."""
    encoded = base64.b64encode(screenshot)
    # b64encode yields ASCII-only bytes; turn them into a str for transport.
    return str(encoded, 'utf-8')

def show_click_indicator(x: int, y: int, duration: float = 2.0, size: int = 50) -> None:
    """Placeholder for the on-screen click marker; currently does nothing.

    The intended behaviour is to display a red circle at (x, y) for
    ``duration`` seconds inside a ``size`` x ``size`` overlay, trying a PyQt5
    overlay first and falling back to tkinter. That implementation is
    disabled (kept below as comments for reference), so every argument is
    accepted and ignored and the function returns immediately.
    """
    # NOTE(review): the reason the overlay was disabled is not recorded here.
    #
    # --- Disabled implementation, kept for reference ---
    # half = size // 2
    # if PYQT_AVAILABLE:
    #     # Setup QApplication
    #     app = QtWidgets.QApplication.instance() or QtWidgets.QApplication(sys.argv)
    #     # Frameless, transparent, click-through window
    #     w = QtWidgets.QWidget()
    #     flags = QtCore.Qt.FramelessWindowHint | QtCore.Qt.WindowStaysOnTopHint | QtCore.Qt.Tool
    #     w.setWindowFlags(flags)
    #     w.setAttribute(QtCore.Qt.WA_TranslucentBackground)
    #     w.setAttribute(QtCore.Qt.WA_TransparentForMouseEvents)
    #     w.setGeometry(x-half, y-half, size, size)
    #     # Draw circle
    #     pixmap = QtGui.QPixmap(size, size)
    #     pixmap.fill(QtCore.Qt.transparent)
    #     painter = QtGui.QPainter(pixmap)
    #     pen = QtGui.QPen(QtGui.QColor('red'), 4)
    #     painter.setPen(pen)
    #     painter.drawEllipse(2, 2, size-4, size-4)
    #     painter.end()
    #     label = QtWidgets.QLabel(w)
    #     label.setPixmap(pixmap)
    #     w.show()
    #     app.processEvents()
    #     time.sleep(duration)
    #     w.close()
    # else:
    #     # Fallback tkinter overlay (may intercept clicks)
    #     root = tk.Tk()
    #     root.overrideredirect(True)
    #     root.attributes('-topmost', True)
    #     root.attributes('-alpha', 0.5)
    #     root.geometry(f"{size}x{size}+{x-half}+{y-half}")
    #     canvas = tk.Canvas(root, width=size, height=size, highlightthickness=0, bg='white')
    #     canvas.pack()
    #     canvas.create_oval(2, 2, size-2, size-2, outline='red', width=4)
    #     root.update()
    #     time.sleep(duration)
    #     root.destroy()

def press_mouse(mouse_input: MouseInput) -> None:
    """Perform the click described by ``mouse_input`` at its (x, y) position.

    ``mouse_input.click_type`` selects the action: "left", "right" and
    "middle" issue a single click with that button, "double_left" issues a
    double click. Any other value performs no click. In every case a daemon
    thread is then started that calls ``show_click_indicator`` for the same
    coordinates.
    """
    pos_x = mouse_input.x
    pos_y = mouse_input.y
    kind = mouse_input.click_type

    # Ordered (label, action) pairs; the first label equal to `kind` wins.
    handlers = (
        ("left", lambda: pyautogui.click(pos_x, pos_y, button='left')),
        ("double_left", lambda: pyautogui.doubleClick(pos_x, pos_y)),
        ("right", lambda: pyautogui.click(pos_x, pos_y, button='right')),
        ("middle", lambda: pyautogui.click(pos_x, pos_y, button='middle')),
    )
    for label, perform in handlers:
        if kind == label:
            perform()
            break

    # Run the indicator off-thread so the caller is not blocked by it.
    marker = threading.Thread(
        target=show_click_indicator,
        args=(pos_x, pos_y),
        daemon=True,
    )
    marker.start()

def press_keyboard(keyboard_input: KeyboardInput) -> None:
    """Type ``keyboard_input.text`` and optionally press Enter afterwards.

    Nothing is typed when the text is empty or otherwise falsy. Enter is
    pressed only when ``keyboard_input.press_enter`` is truthy, independent
    of whether any text was typed.
    """
    payload = keyboard_input.text
    if payload:
        pyautogui.typewrite(payload)

    wants_enter = keyboard_input.press_enter
    if wants_enter:
        pyautogui.press('enter')

def wait(duration: float) -> None:
    """Block the calling thread for ``duration`` seconds via ``time.sleep``."""
    seconds = duration
    time.sleep(seconds)

def reprompt(nextsteps: str, processor):
    """Take a fresh screenshot and hand it to the processor with ``nextsteps``.

    The screen is captured first, base64-encoded, and then passed as
    ``img_data`` to ``processor.process`` together with ``nextsteps``.

    Args:
        nextsteps: Text forwarded as the first argument of
            ``processor.process``.
        processor: Object exposing ``process(text, img_data=...)``.

    Returns:
        Whatever ``processor.process`` returns. The return annotation is
        left open because that type is not visible from this module.
    """
    encoded_screen = screenshot_to_base64(take_screenshot())
    return processor.process(nextsteps, img_data=encoded_screen)


def _execute(name, args, processor):
    if name == "click_button":
        press_mouse(MouseInput(**args))
    elif name == "type_text":
        press_keyboard(KeyboardInput(**args))
    elif name == "wait":
        wait(**args)
    elif name == "reprompt":
        reprompt(**args, processor=processor)