feat: wip: give OCR+positions
This commit is contained in:
parent
5be7f9aadb
commit
105ab4a04b
@ -51,17 +51,6 @@ class AIProcessor:
|
|||||||
aic.Message(role="user", content=prompt, image=img_data)
|
aic.Message(role="user", content=prompt, image=img_data)
|
||||||
)
|
)
|
||||||
# if image provided, perform OCR and include text positions
|
# if image provided, perform OCR and include text positions
|
||||||
if img_data is not None:
|
|
||||||
# decode base64 if needed
|
|
||||||
try:
|
|
||||||
img_bytes = base64.b64decode(img_data) if isinstance(img_data, str) else img_data
|
|
||||||
ocr_results = ai.compute.perform_ocr(img_bytes)
|
|
||||||
# append OCR results as a tool message
|
|
||||||
self.session.messages.append(
|
|
||||||
aic.Message(role="tool", name="ocr", content=json.dumps(ocr_results))
|
|
||||||
)
|
|
||||||
except Exception as e:
|
|
||||||
logger.debug("OCR failed: %s", e)
|
|
||||||
response = self.oai.chat.completions.create(
|
response = self.oai.chat.completions.create(
|
||||||
model=self.model,
|
model=self.model,
|
||||||
messages=self.session.messages_dict(),
|
messages=self.session.messages_dict(),
|
||||||
@ -113,6 +102,17 @@ class AIProcessor:
|
|||||||
"Shortened message copies for processing: %s", cps
|
"Shortened message copies for processing: %s", cps
|
||||||
)
|
)
|
||||||
if reexec:
|
if reexec:
|
||||||
|
img_bytes = ai.compute.take_screenshot(cross_position=click_positions)
|
||||||
|
img = ai.compute.screenshot_to_base64(
|
||||||
|
img_bytes
|
||||||
|
)
|
||||||
|
|
||||||
|
ocr_results = []
|
||||||
|
try:
|
||||||
|
ocr_results = ai.compute.perform_ocr(img_bytes)
|
||||||
|
except Exception as e:
|
||||||
|
logger.debug("OCR failed: %s", e)
|
||||||
|
|
||||||
self.session.messages.append(
|
self.session.messages.append(
|
||||||
aic.Message(
|
aic.Message(
|
||||||
role="assistant",
|
role="assistant",
|
||||||
@ -120,13 +120,7 @@ class AIProcessor:
|
|||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
img = ai.compute.screenshot_to_base64(
|
outputs.extend( self.process(nextsteps+f"\nOCR Positions: {ocr_results}", img) )
|
||||||
ai.compute.take_screenshot(cross_position=click_positions)
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
outputs.extend( self.process(nextsteps, img) )
|
|
||||||
return [
|
return [
|
||||||
{
|
{
|
||||||
"name": tc.function.name,
|
"name": tc.function.name,
|
||||||
|
Loading…
Reference in New Issue
Block a user