From eaf098a7157bcb5d8097a67e8a5b6b35b62fc520 Mon Sep 17 00:00:00 2001
From: Austin Cameron
Date: Sat, 9 Aug 2025 15:11:39 -0400
Subject: [PATCH] Add support for GPT-5 models with OCR functionality

- Introduced handling for new models (`gpt-5-with-ocr`, `gpt-5-research-with-ocr`, `gpt-5-mini-with-ocr`, and `gpt-5-nano-with-ocr`) in `apis.py`.
- Updated `config.py` to include the new models in validation.
- Changed default model in `main.py` to `gpt-5-with-ocr`.
- Updated `requirements.txt` for greater flexibility with dependency versions.
- Bumped the version in `setup.py` to `1.6.0`.
---
 operate/config.py      |   6 +-
 operate/main.py        |   2 +-
 operate/models/apis.py | 461 +++++++++++++++++++++++++++++++++++++++++
 requirements.txt       |   6 +-
 setup.py               |   2 +-
 5 files changed, 471 insertions(+), 6 deletions(-)

diff --git a/operate/config.py b/operate/config.py
index 09f78da0..d1c87625 100644
--- a/operate/config.py
+++ b/operate/config.py
@@ -140,7 +140,11 @@ def validation(self, model, voice_mode):
             or model == "gpt-4-with-som"
             or model == "gpt-4-with-ocr"
             or model == "gpt-4.1-with-ocr"
-            or model == "o1-with-ocr",
+            or model == "o1-with-ocr"
+            or model == "gpt-5-with-ocr"
+            or model == "gpt-5-research-with-ocr"
+            or model == "gpt-5-mini-with-ocr"
+            or model == "gpt-5-nano-with-ocr",
         )
         self.require_api_key(
             "GOOGLE_API_KEY", "Google API key", model == "gemini-pro-vision"
diff --git a/operate/main.py b/operate/main.py
index 86832e4e..d2473cda 100644
--- a/operate/main.py
+++ b/operate/main.py
@@ -15,7 +15,7 @@ def main_entry():
         "--model",
         help="Specify the model to use",
         required=False,
-        default="gpt-4-with-ocr",
+        default="gpt-5-with-ocr",
     )
 
     # Add a voice flag
diff --git a/operate/models/apis.py b/operate/models/apis.py
index 23794fca..839d3f8b 100644
--- a/operate/models/apis.py
+++ b/operate/models/apis.py
@@ -52,6 +52,18 @@ async def get_next_action(model, messages, objective, session_id):
     if model == "o1-with-ocr":
         operation = await call_o1_with_ocr(messages, objective, model)
         return operation, None
+    if model == "gpt-5-with-ocr":
+        operation = await call_gpt_5_with_ocr(messages, objective, model)
+        return operation, None
+    if model == "gpt-5-research-with-ocr":
+        operation = await call_gpt_5_research_with_ocr(messages, objective, model)
+        return operation, None
+    if model == "gpt-5-mini-with-ocr":
+        operation = await call_gpt_5_mini_with_ocr(messages, objective, model)
+        return operation, None
+    if model == "gpt-5-nano-with-ocr":
+        operation = await call_gpt_5_nano_with_ocr(messages, objective, model)
+        return operation, None
     if model == "agent-1":
         return "coming soon"
     if model == "gemini-pro-vision":
@@ -643,6 +655,455 @@ async def call_o1_with_ocr(messages, objective, model):
     return gpt_4_fallback(messages, objective, model)
 
 
+async def call_gpt_5_with_ocr(messages, objective, model):
+    if config.verbose:
+        print("[call_gpt_5_with_ocr]")
+
+    # Capture a screenshot and ask the model for the next operation
+    try:
+        time.sleep(1)
+        client = config.initialize_openai()
+
+        confirm_system_prompt(messages, objective, model)
+        screenshots_dir = "screenshots"
+        if not os.path.exists(screenshots_dir):
+            os.makedirs(screenshots_dir)
+
+        screenshot_filename = os.path.join(screenshots_dir, "screenshot.png")
+        # Call the function to capture the screen with the cursor
+        capture_screen_with_cursor(screenshot_filename)
+
+        with open(screenshot_filename, "rb") as img_file:
+            img_base64 = base64.b64encode(img_file.read()).decode("utf-8")
+
+        if len(messages) == 1:
+            user_prompt = get_user_first_message_prompt()
+        else:
+            user_prompt = get_user_prompt()
+
+        vision_message = {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": user_prompt},
+                {
+                    "type": "image_url",
+                    "image_url": {"url": f"data:image/png;base64,{img_base64}"},
+                },
+            ],
+        }
+        messages.append(vision_message)
+
+        response = client.chat.completions.create(
+            model="gpt-5",
+            messages=messages,
+        )
+
+        content = response.choices[0].message.content
+
+        content = clean_json(content)
+
+        content_str = content
+
+        content = json.loads(content)
+
+        processed_content = []
+
+        for operation in content:
+            if operation.get("operation") == "click":
+                text_to_click = operation.get("text")
+                if config.verbose:
+                    print(
+                        "[call_gpt_5_with_ocr][click] text_to_click",
+                        text_to_click,
+                    )
+                # Initialize EasyOCR Reader
+                reader = easyocr.Reader(["en"])
+
+                # Read the screenshot
+                result = reader.readtext(screenshot_filename)
+
+                text_element_index = get_text_element(
+                    result, text_to_click, screenshot_filename
+                )
+                coordinates = get_text_coordinates(
+                    result, text_element_index, screenshot_filename
+                )
+
+                # Add the OCR-resolved `coordinates` to the operation
+                operation["x"] = coordinates["x"]
+                operation["y"] = coordinates["y"]
+
+                if config.verbose:
+                    print(
+                        "[call_gpt_5_with_ocr][click] text_element_index",
+                        text_element_index,
+                    )
+                    print(
+                        "[call_gpt_5_with_ocr][click] coordinates",
+                        coordinates,
+                    )
+                    print(
+                        "[call_gpt_5_with_ocr][click] final operation",
+                        operation,
+                    )
+                processed_content.append(operation)
+
+            else:
+                processed_content.append(operation)
+
+        # Append the assistant message only after `processed_content` is built so a failed step doesn't corrupt the message history
+        assistant_message = {"role": "assistant", "content": content_str}
+        messages.append(assistant_message)
+
+        return processed_content
+
+    except Exception as e:
+        print(
+            f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA}[{model}] That did not work. Trying another method {ANSI_RESET}"
+        )
+        if config.verbose:
+            print("[Self-Operating Computer][Operate] error", e)
+            traceback.print_exc()
+        return gpt_4_fallback(messages, objective, model)
+
+
+async def call_gpt_5_nano_with_ocr(messages, objective, model):
+    if config.verbose:
+        print("[call_gpt_5_nano_with_ocr]")
+
+    # Capture a screenshot and ask the model for the next operation
+    try:
+        time.sleep(1)
+        client = config.initialize_openai()
+
+        confirm_system_prompt(messages, objective, model)
+        screenshots_dir = "screenshots"
+        if not os.path.exists(screenshots_dir):
+            os.makedirs(screenshots_dir)
+
+        screenshot_filename = os.path.join(screenshots_dir, "screenshot.png")
+        # Call the function to capture the screen with the cursor
+        capture_screen_with_cursor(screenshot_filename)
+
+        with open(screenshot_filename, "rb") as img_file:
+            img_base64 = base64.b64encode(img_file.read()).decode("utf-8")
+
+        if len(messages) == 1:
+            user_prompt = get_user_first_message_prompt()
+        else:
+            user_prompt = get_user_prompt()
+
+        vision_message = {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": user_prompt},
+                {
+                    "type": "image_url",
+                    "image_url": {"url": f"data:image/png;base64,{img_base64}"},
+                },
+            ],
+        }
+        messages.append(vision_message)
+
+        response = client.chat.completions.create(
+            model="gpt-5-nano",
+            messages=messages,
+        )
+
+        content = response.choices[0].message.content
+
+        content = clean_json(content)
+
+        content_str = content
+
+        content = json.loads(content)
+
+        processed_content = []
+
+        for operation in content:
+            if operation.get("operation") == "click":
+                text_to_click = operation.get("text")
+                if config.verbose:
+                    print(
+                        "[call_gpt_5_nano_with_ocr][click] text_to_click",
+                        text_to_click,
+                    )
+                # Initialize EasyOCR Reader
+                reader = easyocr.Reader(["en"])
+
+                # Read the screenshot
+                result = reader.readtext(screenshot_filename)
+
+                text_element_index = get_text_element(
+                    result, text_to_click, screenshot_filename
+                )
+                coordinates = get_text_coordinates(
+                    result, text_element_index, screenshot_filename
+                )
+
+                # Add the OCR-resolved `coordinates` to the operation
+                operation["x"] = coordinates["x"]
+                operation["y"] = coordinates["y"]
+
+                if config.verbose:
+                    print(
+                        "[call_gpt_5_nano_with_ocr][click] text_element_index",
+                        text_element_index,
+                    )
+                    print(
+                        "[call_gpt_5_nano_with_ocr][click] coordinates",
+                        coordinates,
+                    )
+                    print(
+                        "[call_gpt_5_nano_with_ocr][click] final operation",
+                        operation,
+                    )
+                processed_content.append(operation)
+
+            else:
+                processed_content.append(operation)
+
+        # Append the assistant message only after `processed_content` is built so a failed step doesn't corrupt the message history
+        assistant_message = {"role": "assistant", "content": content_str}
+        messages.append(assistant_message)
+
+        return processed_content
+
+    except Exception as e:
+        print(
+            f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA}[{model}] That did not work. Trying another method {ANSI_RESET}"
+        )
+        if config.verbose:
+            print("[Self-Operating Computer][Operate] error", e)
+            traceback.print_exc()
+        return gpt_4_fallback(messages, objective, model)
+
+
+async def call_gpt_5_research_with_ocr(messages, objective, model):
+    if config.verbose:
+        print("[call_gpt_5_research_with_ocr]")
+
+    # Capture a screenshot and ask the model for the next operation
+    try:
+        time.sleep(1)
+        client = config.initialize_openai()
+
+        confirm_system_prompt(messages, objective, model)
+        screenshots_dir = "screenshots"
+        if not os.path.exists(screenshots_dir):
+            os.makedirs(screenshots_dir)
+
+        screenshot_filename = os.path.join(screenshots_dir, "screenshot.png")
+        # Call the function to capture the screen with the cursor
+        capture_screen_with_cursor(screenshot_filename)
+
+        with open(screenshot_filename, "rb") as img_file:
+            img_base64 = base64.b64encode(img_file.read()).decode("utf-8")
+
+        if len(messages) == 1:
+            user_prompt = get_user_first_message_prompt()
+        else:
+            user_prompt = get_user_prompt()
+
+        vision_message = {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": user_prompt},
+                {
+                    "type": "image_url",
+                    "image_url": {"url": f"data:image/png;base64,{img_base64}"},
+                },
+            ],
+        }
+        messages.append(vision_message)
+
+        response = client.chat.completions.create(
+            model="gpt-5",
+            messages=messages,
+            reasoning_effort="high",  # `reasoning="thorough"` is not a valid Chat Completions argument; `reasoning_effort="high"` is the supported equivalent
+        )
+
+        content = response.choices[0].message.content
+
+        content = clean_json(content)
+
+        content_str = content
+
+        content = json.loads(content)
+
+        processed_content = []
+
+        for operation in content:
+            if operation.get("operation") == "click":
+                text_to_click = operation.get("text")
+                if config.verbose:
+                    print(
+                        "[call_gpt_5_research_with_ocr][click] text_to_click",
+                        text_to_click,
+                    )
+                # Initialize EasyOCR Reader
+                reader = easyocr.Reader(["en"])
+
+                # Read the screenshot
+                result = reader.readtext(screenshot_filename)
+
+                text_element_index = get_text_element(
+                    result, text_to_click, screenshot_filename
+                )
+                coordinates = get_text_coordinates(
+                    result, text_element_index, screenshot_filename
+                )
+
+                # Add the OCR-resolved `coordinates` to the operation
+                operation["x"] = coordinates["x"]
+                operation["y"] = coordinates["y"]
+
+                if config.verbose:
+                    print(
+                        "[call_gpt_5_research_with_ocr][click] text_element_index",
+                        text_element_index,
+                    )
+                    print(
+                        "[call_gpt_5_research_with_ocr][click] coordinates",
+                        coordinates,
+                    )
+                    print(
+                        "[call_gpt_5_research_with_ocr][click] final operation",
+                        operation,
+                    )
+                processed_content.append(operation)
+
+            else:
+                processed_content.append(operation)
+
+        # Append the assistant message only after `processed_content` is built so a failed step doesn't corrupt the message history
+        assistant_message = {"role": "assistant", "content": content_str}
+        messages.append(assistant_message)
+
+        return processed_content
+
+    except Exception as e:
+        print(
+            f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA}[{model}] That did not work. Trying another method {ANSI_RESET}"
+        )
+        if config.verbose:
+            print("[Self-Operating Computer][Operate] error", e)
+            traceback.print_exc()
+        return gpt_4_fallback(messages, objective, model)
+
+
+async def call_gpt_5_mini_with_ocr(messages, objective, model):
+    if config.verbose:
+        print("[call_gpt_5_mini_with_ocr]")
+
+    # Capture a screenshot and ask the model for the next operation
+    try:
+        time.sleep(1)
+        client = config.initialize_openai()
+
+        confirm_system_prompt(messages, objective, model)
+        screenshots_dir = "screenshots"
+        if not os.path.exists(screenshots_dir):
+            os.makedirs(screenshots_dir)
+
+        screenshot_filename = os.path.join(screenshots_dir, "screenshot.png")
+        # Call the function to capture the screen with the cursor
+        capture_screen_with_cursor(screenshot_filename)
+
+        with open(screenshot_filename, "rb") as img_file:
+            img_base64 = base64.b64encode(img_file.read()).decode("utf-8")
+
+        if len(messages) == 1:
+            user_prompt = get_user_first_message_prompt()
+        else:
+            user_prompt = get_user_prompt()
+
+        vision_message = {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": user_prompt},
+                {
+                    "type": "image_url",
+                    "image_url": {"url": f"data:image/png;base64,{img_base64}"},
+                },
+            ],
+        }
+        messages.append(vision_message)
+
+        response = client.chat.completions.create(
+            model="gpt-5-mini",
+            messages=messages,
+        )
+
+        content = response.choices[0].message.content
+
+        content = clean_json(content)
+
+        content_str = content
+
+        content = json.loads(content)
+
+        processed_content = []
+
+        for operation in content:
+            if operation.get("operation") == "click":
+                text_to_click = operation.get("text")
+                if config.verbose:
+                    print(
+                        "[call_gpt_5_mini_with_ocr][click] text_to_click",
+                        text_to_click,
+                    )
+                # Initialize EasyOCR Reader
+                reader = easyocr.Reader(["en"])
+
+                # Read the screenshot
+                result = reader.readtext(screenshot_filename)
+
+                text_element_index = get_text_element(
+                    result, text_to_click, screenshot_filename
+                )
+                coordinates = get_text_coordinates(
+                    result, text_element_index, screenshot_filename
+                )
+
+                # Add the OCR-resolved `coordinates` to the operation
+                operation["x"] = coordinates["x"]
+                operation["y"] = coordinates["y"]
+
+                if config.verbose:
+                    print(
+                        "[call_gpt_5_mini_with_ocr][click] text_element_index",
+                        text_element_index,
+                    )
+                    print(
+                        "[call_gpt_5_mini_with_ocr][click] coordinates",
+                        coordinates,
+                    )
+                    print(
+                        "[call_gpt_5_mini_with_ocr][click] final operation",
+                        operation,
+                    )
+                processed_content.append(operation)
+
+            else:
+                processed_content.append(operation)
+
+        # Append the assistant message only after `processed_content` is built so a failed step doesn't corrupt the message history
+        assistant_message = {"role": "assistant", "content": content_str}
+        messages.append(assistant_message)
+
+        return processed_content
+
+    except Exception as e:
+        print(
+            f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA}[{model}] That did not work. Trying another method {ANSI_RESET}"
+        )
+        if config.verbose:
+            print("[Self-Operating Computer][Operate] error", e)
+            traceback.print_exc()
+        return gpt_4_fallback(messages, objective, model)
+
+
 async def call_gpt_4o_labeled(messages, objective, model):
     time.sleep(1)
 
diff --git a/requirements.txt b/requirements.txt
index c7a646be..5b8c2f71 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -19,8 +19,8 @@ kiwisolver==1.4.5
 matplotlib==3.8.1
 MouseInfo==0.1.3
 mss==9.0.1
-numpy==1.26.1
-openai==1.2.3
+numpy>=1.26.1
+openai>=1.50.0
 packaging==23.2
 Pillow==10.1.0
 prompt-toolkit==3.0.39
@@ -43,7 +43,7 @@ rubicon-objc==0.4.7
 six==1.16.0
 sniffio==1.3.0
 tqdm==4.66.1
-typing_extensions==4.8.0
+typing_extensions>=4.11
 urllib3==2.0.7
 wcwidth==0.2.9
 zipp==3.17.0
diff --git a/setup.py b/setup.py
index 6cdf6163..53d7cefe 100644
--- a/setup.py
+++ b/setup.py
@@ -10,7 +10,7 @@
 
 setup(
     name="self-operating-computer",
-    version="1.5.8",
+    version="1.6.0",
     packages=find_packages(),
     install_requires=required,  # Add dependencies here
     entry_points={
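
Usage sketch (not part of the patch): a minimal way to exercise the new
gpt-5-with-ocr dispatch path in get_next_action, assuming the package is
installed, OPENAI_API_KEY is set, and that get_system_prompt(model, objective)
from operate/models/prompts.py still builds the initial system message the
way operate/main.py does.

    import asyncio

    from operate.models.apis import get_next_action
    from operate.models.prompts import get_system_prompt

    async def main():
        objective = "Open a web browser"
        # The first message must be the system prompt; get_next_action's
        # confirm_system_prompt() call verifies and rebuilds it per step.
        messages = [
            {
                "role": "system",
                "content": get_system_prompt("gpt-5-with-ocr", objective),
            }
        ]
        # The new GPT-5 branches return (operation, None), matching the
        # existing o1-with-ocr branch.
        operation, session_id = await get_next_action(
            model="gpt-5-with-ocr",
            messages=messages,
            objective=objective,
            session_id=None,
        )
        print(operation)

    asyncio.run(main())

Running the CLI without a --model flag now selects gpt-5-with-ocr by
default, per the operate/main.py change above.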