Open-Security-Mapping-Project · johnseekins · Oct 27, 2025 · Oct 28, 2025 · Oct 28, 2025 · Oct 28, 2025
diff --git a/.config/mise.toml b/.config/mise.toml
@@ -1,2 +1,9 @@
 [tools]
 python = "3.13.3"
+node = "latest"
+lefthook = "latest"
+yamllint = "latest"
+actionlint = "latest"
+shellcheck = "latest"
+markdownlint-cli2 = "latest"
+jq = "latest"
diff --git a/.gitignore b/.gitignore
@@ -1,5 +1,11 @@
 __pycache__/
 *.csv
+**/*.csv
 *.json
+**/*.json
 *.xlsx
+**/*.xlsx
 *.xlsx#
+**/*.xlsx#
+*.parquet
+**/*.parquet
diff --git a/.lefthook.yml b/.lefthook.yml
@@ -0,0 +1,60 @@
+#   For an explanation of the options used here, see:
+#   https://lefthook.dev/configuration/
+pre-commit:
+  parallel: true
+  jobs:
+    - name: shell script validation
+      run: shellcheck --shell=bash -x {staged_files}
+      glob:
+        - "*.sh"
+        - "*.zsh"
+        - "*.bash"
+
+    - name: markdown linting
+      run: markdownlint-cli2 --fix --config .markdownlint.json {staged_files}
+      glob:
+        - "*.md"
+      stage_fixed: true
+
+    - name: yaml linting
+      run: yamllint -c .yamllint-config.yaml .
+      glob: "*.y*ml"
+
+    - name: Github Action linting
+      run: actionlint
+      glob:
+        - ".github/workflows/*.y*ml"
+
+    - name: Ruff Formatting
+      run: uv run ruff format -q .
+      glob:
+        - "*.py"
+      stage_fixed: true
+
+    - name: Ruff Syntax checking
+      run: uv run ruff check --fix -q
+      glob:
+        - "*.py"
+      stage_fixed: true
+
+    - name: MyPy type validation
+      run: uv run mypy .
+      glob:
+        - "*.py"
+
+    - name: json validation
+      run: tools/check-json.sh {staged_files}
+      glob:
+        - "*.json"
+
+    - name: check for merge conflicts
+      run: tools/check-merge-conflicts.sh {staged_files}
+
+    - name: check for bad file details
+      run: tools/check-file-details.sh {staged_files}
+      stage_fixed: true
+
+output:
+  - success
+  - failure
+  - summary
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
diff --git a/README.md b/README.md
@@ -78,7 +78,7 @@ Examples of the full setup for some OSes are below:
     eval "$(mise activate bash)"
     pip install --upgrade pip wheel uv
     uv sync
-    uv run pre-commit install
+    lefthook install pre-commit -f
 ```
 
 Another command for installing mise in your session can also work (in bash):
@@ -98,7 +98,7 @@ Another command for installing mise in your session can also work (in bash):
     eval "$(mise activate zsh)"
     pip install --upgrade pip wheel uv
     uv sync
-    uv run pre-commit install
+    lefthook install pre-commit -f
 ```
 
 ## Todo / Known Issues
@@ -115,7 +115,9 @@ seems wrong.
 
 ## Contributing & Code Standards
 
-We have a [.pre-commit-config.yaml](.pre-commit-config.yaml) file which enforces some linting / formatting rules.
+We have a [.lefthook.yml](.lefthook.yml) file which enforces some linting / formatting rules.
+
+We also rely on [ruff](https://docs.astral.sh/ruff/) and [mypy](https://www.mypy-lang.org/) to enforce Python coding standards.
 
 Pull requests and reviews are welcome on the main repo. For checking type safety use [mypy](https://github.com/python/mypy):
 

diff --git a/file_utils.py b/file_utils.py
@@ -1,9 +1,11 @@
 import copy
 import json
+import os
 from schemas import enrichment_print_schema
 from utils import (
     convert_to_dataframe,
     logger,
+    output_folder,
 )
 import xlsxwriter  # type: ignore [import-untyped]
 
@@ -16,8 +18,9 @@ def export_to_file(
     if not facilities_data or not facilities_data.get("facilities", []):
         logger.warning("No data to export!")
         return ""
-
-    full_name = f"{filename}.{file_type}"
+    # make sure the folder we're dropping files into exists
+    os.makedirs(output_folder, exist_ok=True)
+    full_name = f"{output_folder}/{filename}.{file_type}"
     if file_type in ["csv", "xlsx", "parquet"]:
         writer = convert_to_dataframe(facilities_data["facilities"])
         match file_type:
@@ -36,10 +39,9 @@ def export_to_file(
             json.dump(facilities_data, f_out, indent=2, sort_keys=True, default=str)
 
     logger.info(
-        "%s file '%s.%s' created successfully with %s facilities.",
-        file_type,
-        filename,
+        "%s file '%s' created successfully with %s facilities.",
         file_type,
+        full_name,
         len(facilities_data["facilities"]),
     )
     return filename

diff --git a/ice_scrapers/__init__.py b/ice_scrapers/__init__.py
@@ -115,7 +115,7 @@
     "HLG": "Harlingen Field Office",
     "HOU": "Houston Field Office",
     "LOS": "Los Angeles Field Office",
-    "MIA": "Miami Field Office",
+    "MIA": "Miramar Sub Office",
     "NEW": "Newark Field Office",
     "NOL": "New Orleans Field Office",
     "NYC": "New York City Field Office",

diff --git a/ice_scrapers/field_offices.py b/ice_scrapers/field_offices.py
@@ -41,7 +41,7 @@ def scrape_field_offices() -> dict:
     return office_data
 
 
-def _scrape_page(page_url: str) -> list:
+def _scrape_page(page_url: str) -> list[dict]:
     """Scrape a single page of facilities using BeautifulSoup"""
     logger.debug("  Fetching: %s", page_url)
     try:
@@ -120,7 +120,10 @@ def _extract_single_office(element: BeautifulSoup, page_url: str) -> dict:
     field_office = element.select_one(".views-field-title")
     if field_office:
         office["field_office"] = field_office.text.strip()
-        office["id"] = field_office_to_aor[office["field_office"]]
+        try:
+            office["id"] = field_office_to_aor[office["field_office"]]
+        except Exception:
+            logger.warning("Could not attach %s as a field office! Maybe update AORs?", office["field_office"])
     address = element.select_one(".address-line1")
     if address:
         office["address"]["street"] = address.text.strip()

diff --git a/ice_scrapers/spreadsheet_load.py b/ice_scrapers/spreadsheet_load.py
@@ -17,7 +17,6 @@
     facility_schema,
     field_office_schema,
 )
-from typing import Tuple
 from utils import (
     logger,
     session,
@@ -59,7 +58,7 @@
 ]
 
 
-def _download_sheet(keep_sheet: bool = True, force_download: bool = True) -> Tuple[polars.DataFrame, str]:
+def _download_sheet(keep_sheet: bool = True, force_download: bool = True) -> tuple[polars.DataFrame, str]:
     """Download the detention stats sheet from ice.gov"""
     resp = session.get(base_xlsx_url, timeout=120)
     resp.raise_for_status()

diff --git a/ice_scrapers/utils.py b/ice_scrapers/utils.py
@@ -1,6 +1,5 @@
 from bs4 import BeautifulSoup
 import re
-from typing import Tuple
 from utils import (
     logger,
     session,
@@ -35,7 +34,7 @@ def special_facilities(facility: dict) -> dict:
     return facility
 
 
-def repair_name(name: str, locality: str) -> Tuple[str, bool]:
+def repair_name(name: str, locality: str) -> tuple[str, bool]:
     """Even facility names are occasionally bad"""
     matches = [
         {"match": "ALEXANDRIA STAGING FACILI", "replace": "Alexandria Staging Facility", "locality": "ALEXANDRIA"},
@@ -96,7 +95,7 @@ def repair_name(name: str, locality: str) -> Tuple[str, bool]:
     return name, cleaned
 
 
-def repair_street(street: str, locality: str = "") -> Tuple[str, bool]:
+def repair_street(street: str, locality: str = "") -> tuple[str, bool]:
     """Generally, we'll let the spreadsheet win arguments just to be consistent"""
     street_filters = [
         # address mismatch between site and spreadsheet
@@ -217,7 +216,7 @@ def repair_street(street: str, locality: str = "") -> Tuple[str, bool]:
     return street, cleaned
 
 
-def repair_zip(zip_code: int, locality: str) -> Tuple[str, bool]:
+def repair_zip(zip_code: int, locality: str) -> tuple[str, bool]:
     """
     Excel does a cool thing where it strips leading 0s
     Also, many zip codes are mysteriously discordant
@@ -248,7 +247,7 @@ def repair_zip(zip_code: int, locality: str) -> Tuple[str, bool]:
     return zcode, cleaned
 
 
-def repair_locality(locality: str, administrative_area: str) -> Tuple[str, bool]:
+def repair_locality(locality: str, administrative_area: str) -> tuple[str, bool]:
     """
     There is no consistency with any address.
     How the post office ever successfully delivered a letter is beyond me

diff --git a/ice_scrapers/vera_data.py b/ice_scrapers/vera_data.py
@@ -3,7 +3,6 @@
 import os
 import polars
 from schemas import facility_schema
-from typing import Tuple
 from utils import (
     logger,
     session,
@@ -17,7 +16,7 @@
 filename = f"{SCRIPT_DIR}{os.sep}vera_facilities.csv"
 
 
-def _vera_name_fixes(name: str, city: str) -> Tuple[str, bool]:
+def _vera_name_fixes(name: str, city: str) -> tuple[str, bool]:
     """Match Vera names with ice.gov names"""
     matches = [
         {"match": "Adams County", "replace": "Adams County Courthouse", "city": "Ritzville"},
@@ -199,7 +198,7 @@ def _vera_name_fixes(name: str, city: str) -> Tuple[str, bool]:
     return name, fixed
 
 
-def _vera_city_fixes(city: str, state: str) -> Tuple[str, bool]:
+def _vera_city_fixes(city: str, state: str) -> tuple[str, bool]:
     """There are a few cases where getting a state match requires some munging"""
     matches = [
         {"match": "Saipan", "replace": "Susupe, Saipan", "city": "MP"},