feat: wip: give OCR+positions

This commit is contained in:
Showdown76 2025-05-19 16:10:02 +02:00
parent 5be7f9aadb
commit 105ab4a04b

View File

@@ -51,17 +51,6 @@ class AIProcessor:
aic.Message(role="user", content=prompt, image=img_data)
)
# if image provided, perform OCR and include text positions
if img_data is not None:
# decode base64 if needed
try:
img_bytes = base64.b64decode(img_data) if isinstance(img_data, str) else img_data
ocr_results = ai.compute.perform_ocr(img_bytes)
# append OCR results as a tool message
self.session.messages.append(
aic.Message(role="tool", name="ocr", content=json.dumps(ocr_results))
)
except Exception as e:
logger.debug("OCR failed: %s", e)
response = self.oai.chat.completions.create(
model=self.model,
messages=self.session.messages_dict(),
@@ -113,6 +102,17 @@ class AIProcessor:
"Shortened message copies for processing: %s", cps
)
if reexec:
img_bytes = ai.compute.take_screenshot(cross_position=click_positions)
img = ai.compute.screenshot_to_base64(
img_bytes
)
ocr_results = []
try:
ocr_results = ai.compute.perform_ocr(img_bytes)
except Exception as e:
logger.debug("OCR failed: %s", e)
self.session.messages.append(
aic.Message(
role="assistant",
@@ -120,13 +120,7 @@ class AIProcessor:
)
)
img = ai.compute.screenshot_to_base64(
ai.compute.take_screenshot(cross_position=click_positions)
)
outputs.extend( self.process(nextsteps, img) )
outputs.extend( self.process(nextsteps+f"\nOCR Positions: {ocr_results}", img) )
return [
{
"name": tc.function.name,