From 5be7f9aadba9ab3c31793f5cf3e9a8e42f371514 Mon Sep 17 00:00:00 2001
From: Showdown76py
Date: Mon, 19 May 2025 15:59:46 +0200
Subject: [PATCH] feat: Add OCR functionality to process method; integrate
 Tesseract for text extraction from screenshots

---
 ai/compute.py    | 23 +++++++++++++++++++++++
 ai/processor.py  | 14 ++++++++++++++
 requirements.txt |  1 +
 3 files changed, 38 insertions(+)

diff --git a/ai/compute.py b/ai/compute.py
index b9fe727..6a2e3c5 100644
--- a/ai/compute.py
+++ b/ai/compute.py
@@ -1,5 +1,6 @@
 import pyautogui
 import threading
+import pytesseract
 import time, io, base64
 import sys
 from objects.inputs import MouseInput, KeyboardInput, ButtonType
@@ -28,6 +29,28 @@ def take_screenshot(cross_position: list[tuple[int, int]] | None = None) -> byte
     screenshot.save("screenshot.png", format='PNG')
     return buf.getvalue()
 
+def perform_ocr(screenshot: bytes) -> list[dict]:
+    """Perform OCR on screenshot bytes and return list of text blocks with positions."""
+    from PIL import Image  # type: ignore
+    import io
+    # open image from bytes
+    img = Image.open(io.BytesIO(screenshot))
+    # perform OCR, get data dictionary
+    data = pytesseract.image_to_data(img, output_type=pytesseract.Output.DICT)
+    results = []
+    n = len(data.get('level', []))
+    for i in range(n):
+        text = data['text'][i]
+        if text and text.strip():
+            results.append({
+                'text': text,
+                'left': data['left'][i],
+                'top': data['top'][i],
+                'width': data['width'][i],
+                'height': data['height'][i]
+            })
+    return results
+
 def screenshot_to_base64(screenshot: bytes) -> str:
     """Convert screenshot bytes to a base64 encoded string."""
     return base64.b64encode(screenshot).decode('utf-8')
diff --git a/ai/processor.py b/ai/processor.py
index 4eee89a..50c4888 100644
--- a/ai/processor.py
+++ b/ai/processor.py
@@ -1,6 +1,7 @@
 import traceback
 import json
 import openai
+import base64
 from flask import jsonify
 from objects import aic
 import ai.compute
@@ -45,9 +46,22 @@ class AIProcessor:
         click_positions = []  # used for screenshot crosshair position
         nextsteps = ""
         try:
+            # append user prompt with optional image
             self.session.messages.append(
                 aic.Message(role="user", content=prompt, image=img_data)
             )
+            # if image provided, perform OCR and include text positions
+            if img_data is not None:
+                # decode base64 if needed
+                try:
+                    img_bytes = base64.b64decode(img_data) if isinstance(img_data, str) else img_data
+                    ocr_results = ai.compute.perform_ocr(img_bytes)
+                    # append OCR results as a tool message
+                    self.session.messages.append(
+                        aic.Message(role="tool", name="ocr", content=json.dumps(ocr_results))
+                    )
+                except Exception as e:
+                    logger.debug("OCR failed: %s", e)
             response = self.oai.chat.completions.create(
                 model=self.model,
                 messages=self.session.messages_dict(),
diff --git a/requirements.txt b/requirements.txt
index dca86e2..2ad1bdd 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,5 +6,6 @@
 python-dotenv
 pyautogui
 pynput
 pillow
+pytesseract
 # --index-url https://mirrors.sustech.edu.cn/pypi/simple
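
Reviewer note (not applied by git am): a minimal usage sketch for the new OCR helper,
assuming pytesseract plus a system Tesseract binary are installed and the module layout
matches this repo (ai/compute.py). take_screenshot() and perform_ocr() are taken from the
patch above; everything else here is illustrative only.

    # sketch: capture the screen, OCR it, and build the same JSON payload that
    # AIProcessor.process() forwards as a role="tool", name="ocr" message
    import json
    import ai.compute

    screenshot_bytes = ai.compute.take_screenshot()      # raw PNG bytes
    blocks = ai.compute.perform_ocr(screenshot_bytes)    # [{'text', 'left', 'top', 'width', 'height'}, ...]
    ocr_payload = json.dumps(blocks)                      # content of the OCR tool message

    # quick inspection of what the model would receive
    for block in blocks[:5]:
        print(block['text'], (block['left'], block['top']))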