feat: Add OCR functionality to process method; integrate Tesseract for text extraction from screenshots
This commit is contained in:
parent
20764d5d19
commit
5be7f9aadb
@ -1,5 +1,6 @@
|
||||
import pyautogui
|
||||
import threading
|
||||
import pytesseract
|
||||
import time, io, base64
|
||||
import sys
|
||||
from objects.inputs import MouseInput, KeyboardInput, ButtonType
|
||||
@ -28,6 +29,28 @@ def take_screenshot(cross_position: list[tuple[int, int]] | None = None) -> byte
|
||||
screenshot.save("screenshot.png", format='PNG')
|
||||
return buf.getvalue()
|
||||
|
||||
def perform_ocr(screenshot: bytes) -> list[dict]:
    """Perform OCR on screenshot bytes and return detected text blocks with positions.

    Args:
        screenshot: Raw encoded image bytes (e.g. the PNG produced by
            take_screenshot).

    Returns:
        A list of dicts, one per non-blank text fragment found by Tesseract,
        each with keys 'text', 'left', 'top', 'width', 'height' (pixel
        coordinates relative to the image's top-left corner).
    """
    # Local import so PIL is only required when OCR is actually used.
    from PIL import Image  # type: ignore

    # `io` is already imported at module level; no local re-import needed.
    img = Image.open(io.BytesIO(screenshot))

    # Output.DICT yields parallel lists keyed by column name
    # ('text', 'left', 'top', ...), one entry per detected element.
    data = pytesseract.image_to_data(img, output_type=pytesseract.Output.DICT)

    # Iterate the 'text' column directly (rather than indexing via the
    # length of a different column) and keep only non-blank entries.
    return [
        {
            'text': text,
            'left': data['left'][i],
            'top': data['top'][i],
            'width': data['width'][i],
            'height': data['height'][i],
        }
        for i, text in enumerate(data.get('text', []))
        if text and text.strip()
    ]
|
||||
|
||||
def screenshot_to_base64(screenshot: bytes) -> str:
    """Encode raw screenshot bytes as an ASCII base64 string."""
    encoded = base64.b64encode(screenshot)
    return encoded.decode('utf-8')
|
||||
|
@ -1,6 +1,7 @@
|
||||
import traceback
|
||||
import json
|
||||
import openai
|
||||
import base64
|
||||
from flask import jsonify
|
||||
from objects import aic
|
||||
import ai.compute
|
||||
@ -45,9 +46,22 @@ class AIProcessor:
|
||||
click_positions = [] # used for screenshot crosshair position
|
||||
nextsteps = ""
|
||||
try:
|
||||
# append user prompt with optional image
|
||||
self.session.messages.append(
|
||||
aic.Message(role="user", content=prompt, image=img_data)
|
||||
)
|
||||
# if image provided, perform OCR and include text positions
|
||||
if img_data is not None:
|
||||
# decode base64 if needed
|
||||
try:
|
||||
img_bytes = base64.b64decode(img_data) if isinstance(img_data, str) else img_data
|
||||
ocr_results = ai.compute.perform_ocr(img_bytes)
|
||||
# append OCR results as a tool message
|
||||
self.session.messages.append(
|
||||
aic.Message(role="tool", name="ocr", content=json.dumps(ocr_results))
|
||||
)
|
||||
except Exception as e:
|
||||
logger.debug("OCR failed: %s", e)
|
||||
response = self.oai.chat.completions.create(
|
||||
model=self.model,
|
||||
messages=self.session.messages_dict(),
|
||||
|
@ -6,5 +6,6 @@ python-dotenv
|
||||
pyautogui
|
||||
pynput
|
||||
pillow
|
||||
pytesseract
|
||||
|
||||
# --index-url https://mirrors.sustech.edu.cn/pypi/simple
|
||||
|
Loading…
Reference in New Issue
Block a user