feat: wip: pass OCR text and positions to the model in the follow-up prompt

2025-05-19 16:10:02 +02:00
parent 5be7f9aadb
commit 105ab4a04b
1 changed file with 12 additions and 18 deletions
--- a/ai/processor.py
+++ b/ai/processor.py
@@ -51,17 +51,6 @@ class AIProcessor:
                aic.Message(role="user", content=prompt, image=img_data)
            )
            # if image provided, perform OCR and include text positions
            if img_data is not None:
                # decode base64 if needed
                try:
                    img_bytes = base64.b64decode(img_data) if isinstance(img_data, str) else img_data
                    ocr_results = ai.compute.perform_ocr(img_bytes)
                    # append OCR results as a tool message
                    self.session.messages.append(
                        aic.Message(role="tool", name="ocr", content=json.dumps(ocr_results))
                    )
                except Exception as e:
                    logger.debug("OCR failed: %s", e)
            response = self.oai.chat.completions.create(
                model=self.model,
                messages=self.session.messages_dict(),
@@ -113,6 +102,17 @@ class AIProcessor:
                    "Shortened message copies for processing: %s", cps
                )
                if reexec:
                    img_bytes = ai.compute.take_screenshot(cross_position=click_positions)
                    img = ai.compute.screenshot_to_base64(
                        img_bytes
                    )
                    ocr_results = []
                    try:
                        ocr_results = ai.compute.perform_ocr(img_bytes)
                    except Exception as e:
                        logger.debug("OCR failed: %s", e)
                    self.session.messages.append(
                        aic.Message(
                            role="assistant",
@@ -120,13 +120,7 @@ class AIProcessor:
                        )
                    )
-                    img = ai.compute.screenshot_to_base64(
+                    outputs.extend( self.process(nextsteps+f"\nOCR Positions: {ocr_results}", img) )
                        ai.compute.take_screenshot(cross_position=click_positions)
                    )
                    outputs.extend( self.process(nextsteps, img) )
                return [
                    {
                        "name": tc.function.name,