2 changes: 1 addition & 1 deletion compose.production.yaml
@@ -318,7 +318,7 @@ services:
# This job runs various monitoring/grafana checks across the entire cluster.
# It has access to the other nodes via the docker socket.
monitoring:
profiles: ["ol-web0", "ol-web1", "ol-web2", "ol-covers0", "ol-www0", "ol-solr0", "ol-solr1"]
profiles: ["ol-web0", "ol-web1", "ol-web2", "ol-covers0", "ol-www0", "ol-solr0", "ol-solr1", "ol-home0"]
build:
context: .
dockerfile: scripts/monitoring/Dockerfile
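The monitoring service is gated by per-host compose profiles, so adding "ol-home0" to this list is what allows the new scheduled job (below) to run on that machine. Assuming the standard compose CLI, the service would be brought up there with something like: docker compose --profile ol-home0 up -d monitoring.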
9 changes: 9 additions & 0 deletions scripts/monitoring/monitor.py
@@ -95,6 +95,15 @@ async def monitor_solr():
)


@limit_server(["ol-home0"], scheduler)
@scheduler.scheduled_job('interval', seconds=60)
async def monitor_solr_updater():
from scripts.monitoring.solr_updater_monitor import monitor_solr_updater

await monitor_solr_updater()
await monitor_solr_updater(solr_next=True)


@limit_server(["ol-www0"], scheduler)
@scheduler.scheduled_job('interval', seconds=60)
async def monitor_empty_homepage():
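The new job fans out to the standalone monitor below, once for the main updater and once for solr-next. A minimal sketch of running the same checks ad hoc, outside the scheduler (dry_run=True only skips the Graphite submission; the docker and infobase lookups still run, so this assumes an ol-home0-like environment):

import asyncio

from scripts.monitoring.solr_updater_monitor import monitor_solr_updater

asyncio.run(monitor_solr_updater(dry_run=True))
asyncio.run(monitor_solr_updater(solr_next=True, dry_run=True))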
70 changes: 70 additions & 0 deletions scripts/monitoring/solr_updater_monitor.py
@@ -0,0 +1,70 @@
import time
from datetime import datetime

import httpx

from scripts.monitoring.haproxy_monitor import GraphiteEvent
from scripts.monitoring.utils import bash_run


async def monitor_solr_updater(
solr_next: bool = False,
dry_run: bool = False,
):
if solr_next:
offset_file = '/solr-updater-data/solr-next-update.offset'
container_name = 'openlibrary-solr-next-updater-1'
bucket = 'stats.ol.solr-next-updater'
else:
offset_file = '/solr-updater-data/solr-update.offset'
container_name = 'openlibrary-solr-updater-1'
bucket = 'stats.ol.solr-updater'

# Check whether the container is running
docker_ps = bash_run("docker ps --format '{{.Names}}'", capture_output=True)
if container_name not in docker_ps.splitlines():
print(f"[OL-MONITOR] Container {container_name} not running.", flush=True)
return

solr_offset = bash_run(
# Note: the updater containers share a volume mount, so we can exec into
# either one to read the offset file.
f'docker exec openlibrary-solr-updater-1 cat {offset_file}',
capture_output=True,
)

async with httpx.AsyncClient() as client:
response = await client.get(
# Note: infobase must be reached via ol-home to avoid circular-routing
# issues in the docker network.
f"http://ol-home.us.archive.org:7000/openlibrary.org/log/{solr_offset}?limit=1"
)
response.raise_for_status()
data = response.json()
if not data["data"]:
print(
"[OL-MONITOR] No data returned from solr-updater log endpoint.",
flush=True,
)
return

# e.g. 2025-09-20T03:40:33.670905
timestamp = data["data"][0]["timestamp"]
timestamp_parsed = datetime.fromisoformat(timestamp)
now = time.time()
elapsed_seconds = int(now - timestamp_parsed.timestamp())

event = GraphiteEvent(
path=f"{bucket}.seconds_behind",
value=elapsed_seconds,
timestamp=int(now),
)
print(event.serialize_str())
if not dry_run:
event.submit('graphite.us.archive.org:2004')


if __name__ == "__main__":
from scripts.solr_builder.solr_builder.fn_to_cli import FnToCLI

FnToCLI(monitor_solr_updater).run()
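Each run emits a single Graphite datapoint. A small sketch of building one by hand, with illustrative values (the GraphiteEvent constructor mirrors its use above, and the serialized format is asserted in the tests below):

from scripts.monitoring.haproxy_monitor import GraphiteEvent

event = GraphiteEvent(
    path="stats.ol.solr-updater.seconds_behind",
    value=64,  # seconds behind; illustrative
    timestamp=1758378536,  # epoch seconds; illustrative
)
print(event.serialize_str())  # stats.ol.solr-updater.seconds_behind 64 1758378536
# event.submit('graphite.us.archive.org:2004') would actually send it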
94 changes: 94 additions & 0 deletions scripts/monitoring/tests/test_solr_updater_monitor.py
@@ -0,0 +1,94 @@
import json
import re
from datetime import datetime
from unittest.mock import patch

import pytest

from scripts.monitoring.solr_updater_monitor import monitor_solr_updater

DUMMY_OFFSET = "2025-09-20:49255084"


def patch_bash_run(commands: dict[str | re.Pattern[str], str]):
def fake_bash_run(cmd, capture_output=False, sources=None):
for pattern, response in commands.items():
if (
isinstance(pattern, re.Pattern) and pattern.match(cmd)
) or pattern == cmd:
return response if capture_output else None
raise AssertionError(f"Unexpected bash_run call: {cmd}")

return patch('scripts.monitoring.solr_updater_monitor.bash_run', fake_bash_run)


def patch_httpx_get(responses: dict[str | re.Pattern[str], str]):
class FakeResponse:
def __init__(self, text):
self._text = text

def raise_for_status(self):
pass

def json(self):
return json.loads(self._text)

async def fake_httpx_get(client, url, *args, **kwargs):
for pattern, response in responses.items():
if (
isinstance(pattern, re.Pattern) and pattern.match(url)
) or pattern == url:
return FakeResponse(response)
raise AssertionError(f"Unexpected httpx.get call: {url}")

return patch(
'scripts.monitoring.solr_updater_monitor.httpx.AsyncClient.get', fake_httpx_get
)


@pytest.mark.asyncio
async def test_container_not_running(capsys):
fake_bash: dict[str | re.Pattern[str], str] = {
re.compile(r'^docker ps .*'): "some-other-container\nopenlibrary-web-1",
}
with patch_bash_run(fake_bash):
await monitor_solr_updater(dry_run=True)

out = capsys.readouterr().out
assert "Container openlibrary-solr-updater-1 not running" in out


@pytest.mark.asyncio
async def test_container_running(capsys):
fake_bash: dict[str | re.Pattern[str], str] = {
re.compile(r'^docker ps .*'): "openlibrary-solr-updater-1",
re.compile(r'^docker exec .*'): DUMMY_OFFSET,
}
fake_requests: dict[str | re.Pattern[str], str] = {
f'http://ol-home.us.archive.org:7000/openlibrary.org/log/{DUMMY_OFFSET}?limit=1': json.dumps(
{
"data": [
{
"action": "store.put",
"site": "openlibrary.org",
"timestamp": "2025-09-20T14:28:56.908366",
"data": {
# ...
},
}
],
"offset": "2025-09-20:94419809",
}
)
}
now = datetime.fromisoformat("2025-09-20T14:28:56.908366").timestamp() + 64

with (
patch_bash_run(fake_bash),
patch_httpx_get(fake_requests),
patch('scripts.monitoring.solr_updater_monitor.time.time', return_value=now),
):
await monitor_solr_updater(dry_run=True)

out = capsys.readouterr().out
assert out == f'stats.ol.solr-updater.seconds_behind 64 {int(now)}\n'
4 changes: 2 additions & 2 deletions scripts/monitoring/tests/test_utils_sh.py
@@ -6,7 +6,7 @@
def test_bash_run():
# Test without sources
output = bash_run("echo 'Hello, World!'", capture_output=True)
-    assert output.stdout.strip() == "Hello, World!"
+    assert output == "Hello, World!"

# Test with sources
with (
@@ -23,7 +23,7 @@ def test_bash_run():
sources=[source1.name, source2.name],
capture_output=True,
)
-        assert output.stdout.strip() == "source1 source2"
+        assert output == "source1 source2"


def test_log_recent_bot_traffic():
18 changes: 17 additions & 1 deletion scripts/monitoring/utils.py
@@ -2,10 +2,23 @@
import os
import subprocess
from pathlib import Path
from typing import Literal, overload

from apscheduler.schedulers.asyncio import AsyncIOScheduler


# Typed overloads: capture_output=True yields the stripped stdout, else None.
# capture_output is keyword-only (and has no default) in the str overload so
# that a bare bash_run(cmd) call correctly type-checks as returning None.
@overload
def bash_run(
    cmd: str,
    sources: list[str | Path] | None = None,
    *,
    capture_output: Literal[True],
) -> str: ...
@overload
def bash_run(
    cmd: str,
    sources: list[str | Path] | None = None,
    capture_output: Literal[False] = False,
) -> None: ...
def bash_run(cmd: str, sources: list[str | Path] | None = None, capture_output=False):
if not sources:
sources = []
@@ -26,7 +39,7 @@ def bash_run(cmd: str, sources: list[str | Path] | None = None, capture_output=False):
)
)

-    return subprocess.run(
+    p = subprocess.run(
[
"bash",
"-c",
@@ -38,6 +51,9 @@ def bash_run(cmd: str, sources: list[str | Path] | None = None, capture_output=False):
text=capture_output if capture_output else None,
)

if capture_output:
return p.stdout.strip()


def limit_server(allowed_servers: list[str], scheduler: AsyncIOScheduler):
"""
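With the overloads in place, type checkers can narrow bash_run's return type from the capture_output argument. A quick sketch of what callers now see (assuming a checker such as mypy):

from scripts.monitoring.utils import bash_run

out = bash_run("hostname", capture_output=True)  # inferred as str, already stripped
bash_run("echo hi")  # inferred as None; output goes straight to the terminal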
8 changes: 4 additions & 4 deletions scripts/tests/test_obfi_sh.py
@@ -28,7 +28,7 @@ def test_obfi_match_range_multi_day(self):
start = iso_to_ms(start_iso)
end = iso_to_ms(end_iso)
output = bash_w_obfi(f"cat {SAMPLE_LOG} | obfi_match_range {start} {end}")
-        assert output.stdout == SAMPLE_LOG.read_text()
+        assert output == SAMPLE_LOG.read_text().strip()

def test_obfi_match_range_first_hour(self):
# Only lines on 2025-06-27 between 00:00:00 and 00:59:59 should match (first line only)
@@ -38,7 +38,7 @@ def test_obfi_match_range_first_hour(self):
end = iso_to_ms(end_iso)
output = bash_w_obfi(f"cat {SAMPLE_LOG} | obfi_match_range {start} {end}")
expected = SAMPLE_LOG.read_text().splitlines()[0] + '\n'
-        assert output.stdout == expected
+        assert output == expected.strip()

def test_obfi_match_range_second_day_same_hour(self):
# Only lines on 2025-06-28 between 00:00:00 and 00:59:59 should match (third line only)
@@ -48,12 +48,12 @@ def test_obfi_match_range_second_day_same_hour(self):
end = iso_to_ms(end_iso)
output = bash_w_obfi(f"cat {SAMPLE_LOG} | obfi_match_range {start} {end}")
expected = SAMPLE_LOG.read_text().splitlines()[2] + '\n'
-        assert output.stdout == expected
+        assert output == expected.strip()

def test_obfi_match_range_no_lines(self):
start_iso = '2001-09-09T01:46:40Z'
end_iso = '2001-09-09T01:46:41Z'
start = iso_to_ms(start_iso)
end = iso_to_ms(end_iso)
output = bash_w_obfi(f"cat {SAMPLE_LOG} | obfi_match_range {start} {end}")
-        assert output.stdout == ''
+        assert output == ''