2 changes: 1 addition & 1 deletion compose.production.yaml
@@ -318,7 +318,7 @@ services:
# This job runs various monitoring/grafana checks across the entire cluster.
# It has access to the other nodes via the docker socket.
monitoring:
profiles: ["ol-web0", "ol-web1", "ol-web2", "ol-covers0", "ol-www0", "ol-solr0", "ol-solr1"]
profiles: ["ol-web0", "ol-web1", "ol-web2", "ol-covers0", "ol-www0", "ol-solr0", "ol-solr1", "ol-home0"]
build:
context: .
dockerfile: scripts/monitoring/Dockerfile
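The monitoring service is gated by per-host compose profiles, so adding "ol-home0" to this list is what allows the new scheduled job (below) to run on that machine. Assuming the standard compose CLI, the service would be brought up there with something like: docker compose --profile ol-home0 up -d monitoring.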
9 changes: 9 additions & 0 deletions scripts/monitoring/monitor.py
@@ -95,6 +95,15 @@ async def monitor_solr():
)


@limit_server(["ol-home0"], scheduler)
@scheduler.scheduled_job('interval', seconds=60)
async def monitor_solr_updater():
from scripts.monitoring.solr_updater_monitor import monitor_solr_updater

await monitor_solr_updater()
await monitor_solr_updater(solr_next=True)


@limit_server(["ol-www0"], scheduler)
@scheduler.scheduled_job('interval', seconds=60)
async def monitor_empty_homepage():
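The new job fans out to the standalone monitor below, once for the main updater and once for solr-next. A minimal sketch of running the same checks ad hoc, outside the scheduler (dry_run=True only skips the Graphite submission; the docker and infobase lookups still run, so this assumes an ol-home0-like environment):

import asyncio

from scripts.monitoring.solr_updater_monitor import monitor_solr_updater

asyncio.run(monitor_solr_updater(dry_run=True))
asyncio.run(monitor_solr_updater(solr_next=True, dry_run=True))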
70 changes: 70 additions & 0 deletions scripts/monitoring/solr_updater_monitor.py
@@ -0,0 +1,70 @@
import time
from datetime import datetime

import httpx

from scripts.monitoring.haproxy_monitor import GraphiteEvent
from scripts.monitoring.utils import bash_run


async def monitor_solr_updater(
solr_next: bool = False,
dry_run: bool = False,
):
if solr_next:
offset_file = '/solr-updater-data/solr-next-update.offset'
container_name = 'openlibrary-solr-next-updater-1'
bucket = 'stats.ol.solr-next-updater'
else:
offset_file = '/solr-updater-data/solr-update.offset'
container_name = 'openlibrary-solr-updater-1'
bucket = 'stats.ol.solr-updater'

# Check whether the container is running
docker_ps = bash_run("docker ps --format '{{.Names}}'", capture_output=True)
if container_name not in docker_ps.splitlines():
print(f"[OL-MONITOR] Container {container_name} not running.", flush=True)
return

solr_offset = bash_run(
# Note: the updater containers share a volume mount, so we can exec into
# either one to read the offset file.
f'docker exec openlibrary-solr-updater-1 cat {offset_file}',
capture_output=True,
)

async with httpx.AsyncClient() as client:
response = await client.get(
# Note: infobase must be reached via ol-home to avoid circular-routing
# issues in the docker network.
f"http://ol-home.us.archive.org:7000/openlibrary.org/log/{solr_offset}?limit=1"
)
response.raise_for_status()
data = response.json()
if not data["data"]:
print(
"[OL-MONITOR] No data returned from solr-updater log endpoint.",
flush=True,
)
return

# e.g. 2025-09-20T03:40:33.670905
timestamp = data["data"][0]["timestamp"]
timestamp_parsed = datetime.fromisoformat(timestamp)
now = time.time()
elapsed_seconds = int(now - timestamp_parsed.timestamp())

event = GraphiteEvent(
path=f"{bucket}.seconds_behind",
value=elapsed_seconds,
timestamp=int(now),
)
print(event.serialize_str())
if not dry_run:
event.submit('graphite.us.archive.org:2004')


if __name__ == "__main__":
from scripts.solr_builder.solr_builder.fn_to_cli import FnToCLI

FnToCLI(monitor_solr_updater).run()
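Each run emits a single Graphite datapoint. A small sketch of building one by hand, with illustrative values (the GraphiteEvent constructor mirrors its use above, and the serialized format is asserted in the tests below):

from scripts.monitoring.haproxy_monitor import GraphiteEvent

event = GraphiteEvent(
    path="stats.ol.solr-updater.seconds_behind",
    value=64,  # seconds behind; illustrative
    timestamp=1758378536,  # epoch seconds; illustrative
)
print(event.serialize_str())  # stats.ol.solr-updater.seconds_behind 64 1758378536
# event.submit('graphite.us.archive.org:2004') would actually send it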
94 changes: 94 additions & 0 deletions scripts/monitoring/tests/test_solr_updater_monitor.py
@@ -0,0 +1,94 @@
import json
import re
from datetime import datetime
from unittest.mock import patch

import pytest

from scripts.monitoring.solr_updater_monitor import monitor_solr_updater

DUMMY_OFFSET = "2025-09-20:49255084"


def patch_bash_run(commands: dict[str | re.Pattern[str], str]):
def fake_bash_run(cmd, capture_output=False, sources=None):
for pattern, response in commands.items():
if (
isinstance(pattern, re.Pattern) and pattern.match(cmd)
) or pattern == cmd:
return response if capture_output else None
raise AssertionError(f"Unexpected bash_run call: {cmd}")

return patch('scripts.monitoring.solr_updater_monitor.bash_run', fake_bash_run)


def patch_httpx_get(responses: dict[str | re.Pattern[str], str]):
class FakeResponse:
def __init__(self, text):
self._text = text

def raise_for_status(self):
pass

def json(self):
return json.loads(self._text)

async def fake_httpx_get(client, url, *args, **kwargs):
for pattern, response in responses.items():
if (
isinstance(pattern, re.Pattern) and pattern.match(url)
) or pattern == url:
return FakeResponse(response)
raise AssertionError(f"Unexpected httpx.get call: {url}")

return patch(
'scripts.monitoring.solr_updater_monitor.httpx.AsyncClient.get', fake_httpx_get
)


@pytest.mark.asyncio
async def test_container_not_running(capsys):
fake_bash: dict[str | re.Pattern[str], str] = {
re.compile(r'^docker ps .*'): "some-other-container\nopenlibrary-web-1",
}
with patch_bash_run(fake_bash):
await monitor_solr_updater(dry_run=True)

out = capsys.readouterr().out
assert "Container openlibrary-solr-updater-1 not running" in out


@pytest.mark.asyncio
async def test_container_running(capsys):
fake_bash: dict[str | re.Pattern[str], str] = {
re.compile(r'^docker ps .*'): "openlibrary-solr-updater-1",
re.compile(r'^docker exec .*'): DUMMY_OFFSET,
}
fake_requests: dict[str | re.Pattern[str], str] = {
f'http://ol-home.us.archive.org:7000/openlibrary.org/log/{DUMMY_OFFSET}?limit=1': json.dumps(
{
"data": [
{
"action": "store.put",
"site": "openlibrary.org",
"timestamp": "2025-09-20T14:28:56.908366",
"data": {
# ...
},
}
],
"offset": "2025-09-20:94419809",
}
)
}
now = datetime.fromisoformat("2025-09-20T14:28:56.908366").timestamp() + 64

with (
patch_bash_run(fake_bash),
patch_httpx_get(fake_requests),
patch('scripts.monitoring.solr_updater_monitor.time.time', return_value=now),
):
await monitor_solr_updater(dry_run=True)

out = capsys.readouterr().out
assert out == f'stats.ol.solr-updater.seconds_behind 64 {int(now)}\n'
4 changes: 2 additions & 2 deletions scripts/monitoring/tests/test_utils_sh.py
@@ -6,7 +6,7 @@
def test_bash_run():
# Test without sources
output = bash_run("echo 'Hello, World!'", capture_output=True)
-    assert output.stdout.strip() == "Hello, World!"
+    assert output == "Hello, World!"

# Test with sources
with (
@@ -23,7 +23,7 @@ def test_bash_run():
sources=[source1.name, source2.name],
capture_output=True,
)
-        assert output.stdout.strip() == "source1 source2"
+        assert output == "source1 source2"


def test_log_recent_bot_traffic():
18 changes: 17 additions & 1 deletion scripts/monitoring/utils.py
@@ -2,10 +2,23 @@
import os
import subprocess
from pathlib import Path
from typing import Literal, overload

from apscheduler.schedulers.asyncio import AsyncIOScheduler


# Typed overloads: capture_output=True yields the stripped stdout, else None.
# capture_output is keyword-only (and has no default) in the str overload so
# that a bare bash_run(cmd) call correctly type-checks as returning None.
@overload
def bash_run(
    cmd: str,
    sources: list[str | Path] | None = None,
    *,
    capture_output: Literal[True],
) -> str: ...
@overload
def bash_run(
    cmd: str,
    sources: list[str | Path] | None = None,
    capture_output: Literal[False] = False,
) -> None: ...
def bash_run(cmd: str, sources: list[str | Path] | None = None, capture_output=False):
if not sources:
sources = []
@@ -26,7 +39,7 @@ def bash_run(cmd: str, sources: list[str | Path] | None = None, capture_output=False):
)
)

-    return subprocess.run(
+    p = subprocess.run(
[
"bash",
"-c",
@@ -38,6 +51,9 @@ def bash_run(cmd: str, sources: list[str | Path] | None = None, capture_output=False):
text=capture_output if capture_output else None,
)

if capture_output:
return p.stdout.strip()


def limit_server(allowed_servers: list[str], scheduler: AsyncIOScheduler):
"""
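With the overloads in place, type checkers can narrow bash_run's return type from the capture_output argument. A quick sketch of what callers now see (assuming a checker such as mypy):

from scripts.monitoring.utils import bash_run

out = bash_run("hostname", capture_output=True)  # inferred as str, already stripped
bash_run("echo hi")  # inferred as None; output goes straight to the terminal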
8 changes: 4 additions & 4 deletions scripts/tests/test_obfi_sh.py
@@ -28,7 +28,7 @@ def test_obfi_match_range_multi_day(self):
start = iso_to_ms(start_iso)
end = iso_to_ms(end_iso)
output = bash_w_obfi(f"cat {SAMPLE_LOG} | obfi_match_range {start} {end}")
-        assert output.stdout == SAMPLE_LOG.read_text()
+        assert output == SAMPLE_LOG.read_text().strip()

def test_obfi_match_range_first_hour(self):
# Only lines on 2025-06-27 between 00:00:00 and 00:59:59 should match (first line only)
@@ -38,7 +38,7 @@ def test_obfi_match_range_first_hour(self):
end = iso_to_ms(end_iso)
output = bash_w_obfi(f"cat {SAMPLE_LOG} | obfi_match_range {start} {end}")
expected = SAMPLE_LOG.read_text().splitlines()[0] + '\n'
-        assert output.stdout == expected
+        assert output == expected.strip()

def test_obfi_match_range_second_day_same_hour(self):
# Only lines on 2025-06-28 between 00:00:00 and 00:59:59 should match (third line only)
@@ -48,12 +48,12 @@ def test_obfi_match_range_second_day_same_hour(self):
end = iso_to_ms(end_iso)
output = bash_w_obfi(f"cat {SAMPLE_LOG} | obfi_match_range {start} {end}")
expected = SAMPLE_LOG.read_text().splitlines()[2] + '\n'
-        assert output.stdout == expected
+        assert output == expected.strip()

def test_obfi_match_range_no_lines(self):
start_iso = '2001-09-09T01:46:40Z'
end_iso = '2001-09-09T01:46:41Z'
start = iso_to_ms(start_iso)
end = iso_to_ms(end_iso)
output = bash_w_obfi(f"cat {SAMPLE_LOG} | obfi_match_range {start} {end}")
-        assert output.stdout == ''
+        assert output == ''