From eaf098a7157bcb5d8097a67e8a5b6b35b62fc520 Mon Sep 17 00:00:00 2001
From: Austin Cameron
Date: Sat, 9 Aug 2025 15:11:39 -0400
Subject: [PATCH] Add support for GPT-5 models with OCR functionality

- Introduced handling for new models (`gpt-5-with-ocr`, `gpt-5-research-with-ocr`, `gpt-5-mini-with-ocr`, and `gpt-5-nano-with-ocr`) in `apis.py`.
- Updated `config.py` to include the new models in validation.
- Changed default model in `main.py` to `gpt-5-with-ocr`.
- Updated `requirements.txt` for greater flexibility with dependency versions.
- Bumped the version in `setup.py` to `1.6.0`.
---
 operate/config.py      |   6 +-
 operate/main.py        |   2 +-
 operate/models/apis.py | 461 +++++++++++++++++++++++++++++++++++++++++
 requirements.txt       |   6 +-
 setup.py               |   2 +-
 5 files changed, 471 insertions(+), 6 deletions(-)

diff --git a/operate/config.py b/operate/config.py
index 09f78da0..d1c87625 100644
--- a/operate/config.py
+++ b/operate/config.py
@@ -140,7 +140,11 @@ def validation(self, model, voice_mode):
             or model == "gpt-4-with-som"
             or model == "gpt-4-with-ocr"
             or model == "gpt-4.1-with-ocr"
-            or model == "o1-with-ocr",
+            or model == "o1-with-ocr"
+            or model == "gpt-5-with-ocr"
+            or model == "gpt-5-research-with-ocr"
+            or model == "gpt-5-mini-with-ocr"
+            or model == "gpt-5-nano-with-ocr",
         )
         self.require_api_key(
             "GOOGLE_API_KEY", "Google API key", model == "gemini-pro-vision"
diff --git a/operate/main.py b/operate/main.py
index 86832e4e..d2473cda 100644
--- a/operate/main.py
+++ b/operate/main.py
@@ -15,7 +15,7 @@ def main_entry():
         "--model",
         help="Specify the model to use",
         required=False,
-        default="gpt-4-with-ocr",
+        default="gpt-5-with-ocr",
     )
 
     # Add a voice flag
diff --git a/operate/models/apis.py b/operate/models/apis.py
index 23794fca..839d3f8b 100644
--- a/operate/models/apis.py
+++ b/operate/models/apis.py
@@ -52,6 +52,18 @@ async def get_next_action(model, messages, objective, session_id):
     if model == "o1-with-ocr":
         operation = await call_o1_with_ocr(messages, objective, model)
         return operation, None
+    if model == "gpt-5-with-ocr":
+        operation = await call_gpt_5_with_ocr(messages, objective, model)
+        return operation, None
+    if model == "gpt-5-research-with-ocr":
+        operation = await call_gpt_5_research_with_ocr(messages, objective, model)
+        return operation, None
+    if model == "gpt-5-mini-with-ocr":
+        operation = await call_gpt_5_mini_with_ocr(messages, objective, model)
+        return operation, None
+    if model == "gpt-5-nano-with-ocr":
+        operation = await call_gpt_5_nano_with_ocr(messages, objective, model)
+        return operation, None
     if model == "agent-1":
         return "coming soon"
     if model == "gemini-pro-vision":
@@ -643,6 +655,455 @@ async def call_o1_with_ocr(messages, objective, model):
     return gpt_4_fallback(messages, objective, model)
 
 
+async def call_gpt_5_with_ocr(messages, objective, model):
+    if config.verbose:
+        print("[call_gpt_5_with_ocr]")
+
+    # Capture a screenshot and ask the model for the next operation
+    try:
+        time.sleep(1)
+        client = config.initialize_openai()
+
+        confirm_system_prompt(messages, objective, model)
+        screenshots_dir = "screenshots"
+        if not os.path.exists(screenshots_dir):
+            os.makedirs(screenshots_dir)
+
+        screenshot_filename = os.path.join(screenshots_dir, "screenshot.png")
+        # Call the function to capture the screen with the cursor
+        capture_screen_with_cursor(screenshot_filename)
+
+        with open(screenshot_filename, "rb") as img_file:
+            img_base64 = base64.b64encode(img_file.read()).decode("utf-8")
+
+        if len(messages) == 1:
+            user_prompt = get_user_first_message_prompt()
+        else:
+            user_prompt = get_user_prompt()
+
+        vision_message = {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": user_prompt},
+                {
+                    "type": "image_url",
+                    "image_url": {"url": f"data:image/png;base64,{img_base64}"},
+                },
+            ],
+        }
+        messages.append(vision_message)
+
+        response = client.chat.completions.create(
+            model="gpt-5",
+            messages=messages,
+        )
+
+        content = response.choices[0].message.content
+
+        content = clean_json(content)
+
+        content_str = content
+
+        content = json.loads(content)
+
+        processed_content = []
+
+        for operation in content:
+            if operation.get("operation") == "click":
+                text_to_click = operation.get("text")
+                if config.verbose:
+                    print(
+                        "[call_gpt_5_with_ocr][click] text_to_click",
+                        text_to_click,
+                    )
+                # Initialize EasyOCR Reader
+                reader = easyocr.Reader(["en"])
+
+                # Read the screenshot
+                result = reader.readtext(screenshot_filename)
+
+                text_element_index = get_text_element(
+                    result, text_to_click, screenshot_filename
+                )
+                coordinates = get_text_coordinates(
+                    result, text_element_index, screenshot_filename
+                )
+
+                # Add the OCR-resolved `coordinates` to the operation
+                operation["x"] = coordinates["x"]
+                operation["y"] = coordinates["y"]
+
+                if config.verbose:
+                    print(
+                        "[call_gpt_5_with_ocr][click] text_element_index",
+                        text_element_index,
+                    )
+                    print(
+                        "[call_gpt_5_with_ocr][click] coordinates",
+                        coordinates,
+                    )
+                    print(
+                        "[call_gpt_5_with_ocr][click] final operation",
+                        operation,
+                    )
+                processed_content.append(operation)
+
+            else:
+                processed_content.append(operation)
+
+        # Append the assistant message only after `processed_content` is built so a failed step doesn't corrupt the message history
+        assistant_message = {"role": "assistant", "content": content_str}
+        messages.append(assistant_message)
+
+        return processed_content
+
+    except Exception as e:
+        print(
+            f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA}[{model}] That did not work. Trying another method {ANSI_RESET}"
+        )
+        if config.verbose:
+            print("[Self-Operating Computer][Operate] error", e)
+            traceback.print_exc()
+        return gpt_4_fallback(messages, objective, model)
+
+
+async def call_gpt_5_nano_with_ocr(messages, objective, model):
+    if config.verbose:
+        print("[call_gpt_5_nano_with_ocr]")
+
+    # Capture a screenshot and ask the model for the next operation
+    try:
+        time.sleep(1)
+        client = config.initialize_openai()
+
+        confirm_system_prompt(messages, objective, model)
+        screenshots_dir = "screenshots"
+        if not os.path.exists(screenshots_dir):
+            os.makedirs(screenshots_dir)
+
+        screenshot_filename = os.path.join(screenshots_dir, "screenshot.png")
+        # Call the function to capture the screen with the cursor
+        capture_screen_with_cursor(screenshot_filename)
+
+        with open(screenshot_filename, "rb") as img_file:
+            img_base64 = base64.b64encode(img_file.read()).decode("utf-8")
+
+        if len(messages) == 1:
+            user_prompt = get_user_first_message_prompt()
+        else:
+            user_prompt = get_user_prompt()
+
+        vision_message = {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": user_prompt},
+                {
+                    "type": "image_url",
+                    "image_url": {"url": f"data:image/png;base64,{img_base64}"},
+                },
+            ],
+        }
+        messages.append(vision_message)
+
+        response = client.chat.completions.create(
+            model="gpt-5-nano",
+            messages=messages,
+        )
+
+        content = response.choices[0].message.content
+
+        content = clean_json(content)
+
+        content_str = content
+
+        content = json.loads(content)
+
+        processed_content = []
+
+        for operation in content:
+            if operation.get("operation") == "click":
+                text_to_click = operation.get("text")
+                if config.verbose:
+                    print(
+                        "[call_gpt_5_nano_with_ocr][click] text_to_click",
+                        text_to_click,
+                    )
+                # Initialize EasyOCR Reader
+                reader = easyocr.Reader(["en"])
+
+                # Read the screenshot
+                result = reader.readtext(screenshot_filename)
+
+                text_element_index = get_text_element(
+                    result, text_to_click, screenshot_filename
+                )
+                coordinates = get_text_coordinates(
+                    result, text_element_index, screenshot_filename
+                )
+
+                # Add the OCR-resolved `coordinates` to the operation
+                operation["x"] = coordinates["x"]
+                operation["y"] = coordinates["y"]
+
+                if config.verbose:
+                    print(
+                        "[call_gpt_5_nano_with_ocr][click] text_element_index",
+                        text_element_index,
+                    )
+                    print(
+                        "[call_gpt_5_nano_with_ocr][click] coordinates",
+                        coordinates,
+                    )
+                    print(
+                        "[call_gpt_5_nano_with_ocr][click] final operation",
+                        operation,
+                    )
+                processed_content.append(operation)
+
+            else:
+                processed_content.append(operation)
+
+        # Append the assistant message only after `processed_content` is built so a failed step doesn't corrupt the message history
+        assistant_message = {"role": "assistant", "content": content_str}
+        messages.append(assistant_message)
+
+        return processed_content
+
+    except Exception as e:
+        print(
+            f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA}[{model}] That did not work. Trying another method {ANSI_RESET}"
+        )
+        if config.verbose:
+            print("[Self-Operating Computer][Operate] error", e)
+            traceback.print_exc()
+        return gpt_4_fallback(messages, objective, model)
+
+
+async def call_gpt_5_research_with_ocr(messages, objective, model):
+    if config.verbose:
+        print("[call_gpt_5_research_with_ocr]")
+
+    # Capture a screenshot and ask the model for the next operation
+    try:
+        time.sleep(1)
+        client = config.initialize_openai()
+
+        confirm_system_prompt(messages, objective, model)
+        screenshots_dir = "screenshots"
+        if not os.path.exists(screenshots_dir):
+            os.makedirs(screenshots_dir)
+
+        screenshot_filename = os.path.join(screenshots_dir, "screenshot.png")
+        # Call the function to capture the screen with the cursor
+        capture_screen_with_cursor(screenshot_filename)
+
+        with open(screenshot_filename, "rb") as img_file:
+            img_base64 = base64.b64encode(img_file.read()).decode("utf-8")
+
+        if len(messages) == 1:
+            user_prompt = get_user_first_message_prompt()
+        else:
+            user_prompt = get_user_prompt()
+
+        vision_message = {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": user_prompt},
+                {
+                    "type": "image_url",
+                    "image_url": {"url": f"data:image/png;base64,{img_base64}"},
+                },
+            ],
+        }
+        messages.append(vision_message)
+
+        response = client.chat.completions.create(
+            model="gpt-5",
+            messages=messages,
+            reasoning_effort="high",  # `reasoning="thorough"` is not a valid Chat Completions argument; `reasoning_effort="high"` is the supported equivalent
+        )
+
+        content = response.choices[0].message.content
+
+        content = clean_json(content)
+
+        content_str = content
+
+        content = json.loads(content)
+
+        processed_content = []
+
+        for operation in content:
+            if operation.get("operation") == "click":
+                text_to_click = operation.get("text")
+                if config.verbose:
+                    print(
+                        "[call_gpt_5_research_with_ocr][click] text_to_click",
+                        text_to_click,
+                    )
+                # Initialize EasyOCR Reader
+                reader = easyocr.Reader(["en"])
+
+                # Read the screenshot
+                result = reader.readtext(screenshot_filename)
+
+                text_element_index = get_text_element(
+                    result, text_to_click, screenshot_filename
+                )
+                coordinates = get_text_coordinates(
+                    result, text_element_index, screenshot_filename
+                )
+
+                # Add the OCR-resolved `coordinates` to the operation
+                operation["x"] = coordinates["x"]
+                operation["y"] = coordinates["y"]
+
+                if config.verbose:
+                    print(
+                        "[call_gpt_5_research_with_ocr][click] text_element_index",
+                        text_element_index,
+                    )
+                    print(
+                        "[call_gpt_5_research_with_ocr][click] coordinates",
+                        coordinates,
+                    )
+                    print(
+                        "[call_gpt_5_research_with_ocr][click] final operation",
+                        operation,
+                    )
+                processed_content.append(operation)
+
+            else:
+                processed_content.append(operation)
+
+        # Append the assistant message only after `processed_content` is built so a failed step doesn't corrupt the message history
+        assistant_message = {"role": "assistant", "content": content_str}
+        messages.append(assistant_message)
+
+        return processed_content
+
+    except Exception as e:
+        print(
+            f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA}[{model}] That did not work. Trying another method {ANSI_RESET}"
+        )
+        if config.verbose:
+            print("[Self-Operating Computer][Operate] error", e)
+            traceback.print_exc()
+        return gpt_4_fallback(messages, objective, model)
+
+
+async def call_gpt_5_mini_with_ocr(messages, objective, model):
+    if config.verbose:
+        print("[call_gpt_5_mini_with_ocr]")
+
+    # Capture a screenshot and ask the model for the next operation
+    try:
+        time.sleep(1)
+        client = config.initialize_openai()
+
+        confirm_system_prompt(messages, objective, model)
+        screenshots_dir = "screenshots"
+        if not os.path.exists(screenshots_dir):
+            os.makedirs(screenshots_dir)
+
+        screenshot_filename = os.path.join(screenshots_dir, "screenshot.png")
+        # Call the function to capture the screen with the cursor
+        capture_screen_with_cursor(screenshot_filename)
+
+        with open(screenshot_filename, "rb") as img_file:
+            img_base64 = base64.b64encode(img_file.read()).decode("utf-8")
+
+        if len(messages) == 1:
+            user_prompt = get_user_first_message_prompt()
+        else:
+            user_prompt = get_user_prompt()
+
+        vision_message = {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": user_prompt},
+                {
+                    "type": "image_url",
+                    "image_url": {"url": f"data:image/png;base64,{img_base64}"},
+                },
+            ],
+        }
+        messages.append(vision_message)
+
+        response = client.chat.completions.create(
+            model="gpt-5-mini",
+            messages=messages,
+        )
+
+        content = response.choices[0].message.content
+
+        content = clean_json(content)
+
+        content_str = content
+
+        content = json.loads(content)
+
+        processed_content = []
+
+        for operation in content:
+            if operation.get("operation") == "click":
+                text_to_click = operation.get("text")
+                if config.verbose:
+                    print(
+                        "[call_gpt_5_mini_with_ocr][click] text_to_click",
+                        text_to_click,
+                    )
+                # Initialize EasyOCR Reader
+                reader = easyocr.Reader(["en"])
+
+                # Read the screenshot
+                result = reader.readtext(screenshot_filename)
+
+                text_element_index = get_text_element(
+                    result, text_to_click, screenshot_filename
+                )
+                coordinates = get_text_coordinates(
+                    result, text_element_index, screenshot_filename
+                )
+
+                # Add the OCR-resolved `coordinates` to the operation
+                operation["x"] = coordinates["x"]
+                operation["y"] = coordinates["y"]
+
+                if config.verbose:
+                    print(
+                        "[call_gpt_5_mini_with_ocr][click] text_element_index",
+                        text_element_index,
+                    )
+                    print(
+                        "[call_gpt_5_mini_with_ocr][click] coordinates",
+                        coordinates,
+                    )
+                    print(
+                        "[call_gpt_5_mini_with_ocr][click] final operation",
+                        operation,
+                    )
+                processed_content.append(operation)
+
+            else:
+                processed_content.append(operation)
+
+        # Append the assistant message only after `processed_content` is built so a failed step doesn't corrupt the message history
+        assistant_message = {"role": "assistant", "content": content_str}
+        messages.append(assistant_message)
+
+        return processed_content
+
+    except Exception as e:
+        print(
+            f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA}[{model}] That did not work. Trying another method {ANSI_RESET}"
+        )
+        if config.verbose:
+            print("[Self-Operating Computer][Operate] error", e)
+            traceback.print_exc()
+        return gpt_4_fallback(messages, objective, model)
+
+
 async def call_gpt_4o_labeled(messages, objective, model):
     time.sleep(1)
 
diff --git a/requirements.txt b/requirements.txt
index c7a646be..5b8c2f71 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -19,8 +19,8 @@ kiwisolver==1.4.5
 matplotlib==3.8.1
 MouseInfo==0.1.3
 mss==9.0.1
-numpy==1.26.1
-openai==1.2.3
+numpy>=1.26.1
+openai>=1.50.0
 packaging==23.2
 Pillow==10.1.0
 prompt-toolkit==3.0.39
@@ -43,7 +43,7 @@ rubicon-objc==0.4.7
 six==1.16.0
 sniffio==1.3.0
 tqdm==4.66.1
-typing_extensions==4.8.0
+typing_extensions>=4.11
 urllib3==2.0.7
 wcwidth==0.2.9
 zipp==3.17.0
diff --git a/setup.py b/setup.py
index 6cdf6163..53d7cefe 100644
--- a/setup.py
+++ b/setup.py
@@ -10,7 +10,7 @@
 
 setup(
     name="self-operating-computer",
-    version="1.5.8",
+    version="1.6.0",
     packages=find_packages(),
     install_requires=required,  # Add dependencies here
     entry_points={
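
Usage sketch (not part of the patch): a minimal way to exercise the new
gpt-5-with-ocr dispatch path in get_next_action, assuming the package is
installed, OPENAI_API_KEY is set, and that get_system_prompt(model, objective)
from operate/models/prompts.py still builds the initial system message the
way operate/main.py does.

    import asyncio

    from operate.models.apis import get_next_action
    from operate.models.prompts import get_system_prompt

    async def main():
        objective = "Open a web browser"
        # The first message must be the system prompt; get_next_action's
        # confirm_system_prompt() call verifies and rebuilds it per step.
        messages = [
            {
                "role": "system",
                "content": get_system_prompt("gpt-5-with-ocr", objective),
            }
        ]
        # The new GPT-5 branches return (operation, None), matching the
        # existing o1-with-ocr branch.
        operation, session_id = await get_next_action(
            model="gpt-5-with-ocr",
            messages=messages,
            objective=objective,
            session_id=None,
        )
        print(operation)

    asyncio.run(main())

Running the CLI without a --model flag now selects gpt-5-with-ocr by
default, per the operate/main.py change above.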