feat: wip: give OCR+positions

2025-05-19 16:10:02 +02:00
parent 5be7f9aadb
commit 105ab4a04b
1 changed files with 12 additions and 18 deletions
--- a/ai/processor.py
+++ b/ai/processor.py
@@ -51,17 +51,6 @@ class AIProcessor:
                aic.Message(role="user", content=prompt, image=img_data)
            )
            # if image provided, perform OCR and include text positions
-            if img_data is not None:
-                # decode base64 if needed
-                try:
-                    img_bytes = base64.b64decode(img_data) if isinstance(img_data, str) else img_data
-                    ocr_results = ai.compute.perform_ocr(img_bytes)
-                    # append OCR results as a tool message
-                    self.session.messages.append(
-                        aic.Message(role="tool", name="ocr", content=json.dumps(ocr_results))
-                    )
-                except Exception as e:
-                    logger.debug("OCR failed: %s", e)
            response = self.oai.chat.completions.create(
                model=self.model,
                messages=self.session.messages_dict(),
@@ -113,6 +102,17 @@ class AIProcessor:
                    "Shortened message copies for processing: %s", cps
                )
                if reexec:
+                    img_bytes = ai.compute.take_screenshot(cross_position=click_positions)
+                    img = ai.compute.screenshot_to_base64(
+                        img_bytes
+                    )
+
+                    ocr_results = []
+                    try:
+                        ocr_results = ai.compute.perform_ocr(img_bytes)
+                    except Exception as e:
+                        logger.debug("OCR failed: %s", e)
+
                    self.session.messages.append(
                        aic.Message(
                            role="assistant",
@@ -120,13 +120,7 @@ class AIProcessor:
                        )
                    )

-                    img = ai.compute.screenshot_to_base64(
-                        ai.compute.take_screenshot(cross_position=click_positions)
-                    )
-
-
-
-                    outputs.extend( self.process(nextsteps, img) )
+                    outputs.extend( self.process(nextsteps+f"\nOCR Positions: {ocr_results}", img) )
                return [
                    {
                        "name": tc.function.name,