import base64
import io
import sys
import threading
import time

import pyautogui
import pytesseract
from PIL import ImageGrab, ImageDraw  # type: ignore

from objects.inputs import MouseInput, KeyboardInput, ButtonType


def take_screenshot(cross_position: list[tuple[int, int]] | None = None) -> bytes:
    """Grab the current screen and return it as PNG-encoded bytes.

    Args:
        cross_position: Optional list of ``(x, y)`` pixel positions. A red
            crosshair is drawn on the captured image at each position.

    Returns:
        The (optionally annotated) screenshot encoded as PNG.

    Side effects:
        The same image is also written to ``screenshot.png`` in the current
        working directory on every call.
    """
    screenshot = ImageGrab.grab()

    # Optionally draw a crosshair at each requested position.
    if cross_position:
        # One drawing context and one set of style constants serve all marks.
        draw = ImageDraw.Draw(screenshot)
        size = 20  # half-length of each arm
        color = (255, 0, 0)
        width = 2
        for x, y in cross_position:
            # horizontal line
            draw.line((x - size, y, x + size, y), fill=color, width=width)
            # vertical line
            draw.line((x, y - size, x, y + size), fill=color, width=width)

    buf = io.BytesIO()
    screenshot.save(buf, format='PNG')
    # Also keep a copy on disk.
    screenshot.save("screenshot.png", format='PNG')
    return buf.getvalue()


def perform_ocr(screenshot: bytes) -> list[dict]:
    """Run OCR on encoded image bytes and return the non-blank text blocks.

    Args:
        screenshot: Encoded image bytes (e.g. the output of
            ``take_screenshot``).

    Returns:
        One dict per OCR entry whose text is not empty/whitespace, with the
        keys ``'text'``, ``'left'``, ``'top'``, ``'width'`` and ``'height'``
        copied from the pytesseract result columns.
    """
    from PIL import Image  # type: ignore

    # Open the image from bytes.
    img = Image.open(io.BytesIO(screenshot))
    # Perform OCR and get the column-oriented data dictionary.
    data = pytesseract.image_to_data(img, output_type=pytesseract.Output.DICT)

    results = []
    # The number of entries is taken from the 'level' column; if that column
    # is absent, no entries are reported.
    n = len(data.get('level', []))
    for i in range(n):
        text = data['text'][i]
        if not text or not text.strip():
            continue
        results.append({
            'text': text,
            'left': data['left'][i],
            'top': data['top'][i],
            'width': data['width'][i],
            'height': data['height'][i]
        })
    return results


def screenshot_to_base64(screenshot: bytes) -> str:
    """Return ``screenshot`` base64-encoded as a UTF-8 decoded string."""
    return base64.b64encode(screenshot).decode('utf-8')


def show_click_indicator(x: int, y: int, duration: float = 2.0, size: int = 50) -> None:
    """Placeholder for a click-through red circle overlay at ``(x, y)``.

    Not implemented: this function currently does nothing and ignores all of
    its arguments. ``press_mouse`` still invokes it on a background thread.
    """
    pass


def press_mouse(mouse_input: MouseInput) -> None:
    """Click at the position described by ``mouse_input``.

    Reads ``mouse_input.x``, ``mouse_input.y`` and ``mouse_input.click_type``.
    Recognised click types are ``"left"``, ``"double_left"``, ``"right"`` and
    ``"middle"``; any other value performs no click. The click indicator
    thread is started in every case.
    """
    x, y = mouse_input.x, mouse_input.y
    button = mouse_input.click_type
    if button == "left":
        pyautogui.click(x, y, button='left')
    elif button == "double_left":
        pyautogui.doubleClick(x, y)
    elif button == "right":
        pyautogui.click(x, y, button='right')
    elif button == "middle":
        pyautogui.click(x, y, button='middle')

    # Show the indicator at the click position without blocking the caller;
    # daemon=True so the thread never keeps the process alive.
    threading.Thread(target=show_click_indicator, args=(x, y), daemon=True).start()


def press_keyboard(keyboard_input: KeyboardInput) -> None:
    """Type ``keyboard_input.text`` and optionally press Enter.

    Nothing is typed when the text is empty/None. Enter is pressed afterwards
    when ``keyboard_input.press_enter`` is truthy, regardless of the text.
    """
    text = keyboard_input.text
    if text:
        pyautogui.typewrite(text)
    if keyboard_input.press_enter:
        pyautogui.press('enter')


def wait(duration: float) -> None:
    """Block the calling thread for ``duration`` seconds."""
    time.sleep(duration)


def windows_key() -> None:
    """Press the Windows key."""
    pyautogui.hotkey('win')


def reprompt(nextsteps: str, processor):
    """Take a fresh screenshot and hand it to ``processor`` with ``nextsteps``.

    Args:
        nextsteps: Text forwarded as the first argument of
            ``processor.process``.
        processor: Object exposing ``process(nextsteps, img_data=...)``.

    Returns:
        Whatever ``processor.process`` returns.
    """
    scr = screenshot_to_base64(take_screenshot())
    return processor.process(nextsteps, img_data=scr)


def _execute(name, args=None, processor=None):
    """Dispatch a named action to the matching helper.

    Args:
        name: Action name: ``"click_button"``, ``"type_text"``, ``"wait"``,
            ``"windows_key"`` or ``"reprompt"``. Other names are ignored.
        args: Mapping of keyword arguments for the action. ``None`` (the
            default) or any empty value is treated as no arguments.
        processor: Forwarded to ``reprompt``; unused by the other actions.

    Returns:
        None. The result of ``reprompt`` is not propagated.
    """
    # The previous default was a shared list, which cannot be ``**``-unpacked;
    # normalise missing/empty arguments to an empty mapping instead.
    kwargs = args or {}

    if name == "click_button":
        press_mouse(MouseInput(**kwargs))
    elif name == "type_text":
        press_keyboard(KeyboardInput(**kwargs))
    elif name == "wait":
        wait(**kwargs)
    elif name == "windows_key":
        windows_key()
    elif name == "reprompt":
        reprompt(**kwargs, processor=processor)