Skip to content

Commit 4940898

Browse files
committed
Add solr-updater delay monitoring
1 parent 10ed6be commit 4940898

File tree

7 files changed

+197
-8
lines changed

7 files changed

+197
-8
lines changed

compose.production.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -318,7 +318,7 @@ services:
318318
# This job runs various monitoring/grafana checks across the entire cluster.
319319
# It has access to the other nodes via the docker socket.
320320
monitoring:
321-
profiles: ["ol-web0", "ol-web1", "ol-web2", "ol-covers0", "ol-www0", "ol-solr0", "ol-solr1"]
321+
profiles: ["ol-web0", "ol-web1", "ol-web2", "ol-covers0", "ol-www0", "ol-solr0", "ol-solr1", "ol-home0"]
322322
build:
323323
context: .
324324
dockerfile: scripts/monitoring/Dockerfile

scripts/monitoring/monitor.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,15 @@ async def monitor_solr():
9595
)
9696

9797

98+
@limit_server(["ol-home0"], scheduler)
@scheduler.scheduled_job('interval', seconds=60)
async def monitor_solr_updater():
    """Scheduled job: run the solr-updater delay check for both updaters.

    Registered with the scheduler on a 60 second interval and passed through
    ``limit_server`` with ``["ol-home0"]``. The regular updater is checked
    first, then the solr-next updater.
    """
    # Import the module rather than the function so the callee is not hidden
    # behind a local name identical to this job's own name.
    from scripts.monitoring import solr_updater_monitor

    for solr_next in (False, True):
        await solr_updater_monitor.monitor_solr_updater(solr_next=solr_next)
105+
106+
98107
@limit_server(["ol-www0"], scheduler)
99108
@scheduler.scheduled_job('interval', seconds=60)
100109
async def monitor_empty_homepage():
Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
import time
2+
from datetime import datetime
3+
4+
import httpx
5+
6+
from scripts.monitoring.haproxy_monitor import GraphiteEvent
7+
from scripts.monitoring.utils import bash_run
8+
9+
10+
async def monitor_solr_updater(
    solr_next: bool = False,
    dry_run: bool = False,
):
    """Report how many seconds a solr-updater is behind the infobase log.

    Reads the updater's saved offset, asks the infobase log endpoint for the
    first entry at that offset, and emits the age of that entry as the
    ``<bucket>.seconds_behind`` Graphite metric.

    :param solr_next: Check the solr-next updater (its own offset file,
        container and metric bucket) instead of the regular updater.
    :param dry_run: Print the serialized event but do not submit it.
    :raises httpx.HTTPStatusError: If the log endpoint answers with an error
        status (via ``raise_for_status``).
    """
    if solr_next:
        offset_file = '/solr-updater-data/solr-next-update.offset'
        container_name = 'openlibrary-solr-next-updater-1'
        bucket = 'stats.ol.solr-next-updater'
    else:
        offset_file = '/solr-updater-data/solr-update.offset'
        container_name = 'openlibrary-solr-updater-1'
        bucket = 'stats.ol.solr-updater'

    # Check if container running
    docker_ps = bash_run("docker ps --format '{{.Names}}'", capture_output=True)
    if container_name not in docker_ps.splitlines():
        print(f"[OL-MONITOR] Container {container_name} not running.", flush=True)
        return

    solr_offset = bash_run(
        # Both updaters share the volume mount holding the offset files, so
        # either container can read either file. Exec into the container that
        # was just confirmed to be running, so this check does not depend on
        # the other updater being up.
        f'docker exec {container_name} cat {offset_file}',
        capture_output=True,
    )
    if not solr_offset:
        # Without an offset the log URL below would be malformed.
        print(
            f"[OL-MONITOR] Could not read offset from {offset_file}.",
            flush=True,
        )
        return

    async with httpx.AsyncClient() as client:
        response = await client.get(
            # Note have to access infobase via ol-home to avoid circular routing
            # issues in docker network.
            f"http://ol-home.us.archive.org:7000/openlibrary.org/log/{solr_offset}?limit=1"
        )
        response.raise_for_status()
        data = response.json()
        if not data["data"]:
            print(
                "[OL-MONITOR] No data returned from solr-updater log endpoint.",
                flush=True,
            )
            return

        # e.g. 2025-09-20T03:40:33.670905
        timestamp = data["data"][0]["timestamp"]
        # NOTE(review): the timestamp carries no UTC offset, so .timestamp()
        # interprets it in this process's local timezone. Confirm that matches
        # the timezone infobase writes its log timestamps in.
        timestamp_parsed = datetime.fromisoformat(timestamp)
        now = time.time()
        elapsed_seconds = int(now - timestamp_parsed.timestamp())

        event = GraphiteEvent(
            path=f"{bucket}.seconds_behind",
            value=elapsed_seconds,
            timestamp=int(now),
        )
        print(event.serialize_str())
        if not dry_run:
            event.submit('graphite.us.archive.org:2004')
66+
67+
if __name__ == "__main__":
    # Script entry point. FnToCLI is imported only here, so importing this
    # module as a library does not pull it in. It presumably maps the
    # function's parameters to command-line options (verify against FnToCLI).
    from scripts.solr_builder.solr_builder.fn_to_cli import FnToCLI

    cli = FnToCLI(monitor_solr_updater)
    cli.run()
Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
import json
2+
import re
3+
from datetime import datetime
4+
from unittest.mock import patch
5+
6+
import pytest
7+
8+
from scripts.monitoring.solr_updater_monitor import monitor_solr_updater
9+
10+
DUMMY_OFFSET = "2025-09-20:49255084"
11+
12+
13+
def patch_bash_run(commands: dict[str | re.Pattern[str], str]):
14+
def fake_bash_run(cmd, capture_output=False, sources=None):
15+
for pattern, response in commands.items():
16+
if (
17+
isinstance(pattern, re.Pattern) and pattern.match(cmd)
18+
) or pattern == cmd:
19+
return response if capture_output else None
20+
raise AssertionError(f"Unexpected bash_run call: {cmd}")
21+
22+
return patch('scripts.monitoring.solr_updater_monitor.bash_run', fake_bash_run)
23+
24+
25+
def patch_httpx_get(responses: dict[str | re.Pattern[str], str]):
26+
class FakeResponse:
27+
def __init__(self, text):
28+
self._text = text
29+
30+
def raise_for_status(self):
31+
pass
32+
33+
def json(self):
34+
return json.loads(self._text)
35+
36+
async def fake_httpx_get(client, url, *args, **kwargs):
37+
for pattern, response in responses.items():
38+
if (
39+
isinstance(pattern, re.Pattern) and pattern.match(url)
40+
) or pattern == url:
41+
return FakeResponse(response)
42+
raise AssertionError(f"Unexpected httpx.get call: {url}")
43+
44+
return patch(
45+
'scripts.monitoring.solr_updater_monitor.httpx.AsyncClient.get', fake_httpx_get
46+
)
47+
48+
49+
@pytest.mark.asyncio
async def test_container_not_running(capsys):
    """The monitor prints a notice and stops when the updater is not listed."""
    # `docker ps` reports only unrelated containers.
    listing = "some-other-container\nopenlibrary-web-1"
    commands: dict[str | re.Pattern[str], str] = {
        re.compile(r'^docker ps .*'): listing,
    }

    with patch_bash_run(commands):
        await monitor_solr_updater(dry_run=True)

    captured = capsys.readouterr()
    assert "Container openlibrary-solr-updater-1 not running" in captured.out
59+
60+
61+
@pytest.mark.asyncio
async def test_container_running(capsys):
    """With the updater running, the monitor prints the lag as a Graphite line."""
    # Canned infobase log reply: a single entry whose timestamp sets the lag.
    log_reply = json.dumps(
        {
            "data": [
                {
                    "action": "store.put",
                    "site": "openlibrary.org",
                    "timestamp": "2025-09-20T14:28:56.908366",
                    "data": {
                        # ...
                    },
                }
            ],
            "offset": "2025-09-20:94419809",
        }
    )
    fake_requests: dict[str | re.Pattern[str], str] = {
        f'http://ol-home.us.archive.org:7000/openlibrary.org/log/{DUMMY_OFFSET}?limit=1': log_reply
    }
    fake_bash: dict[str | re.Pattern[str], str] = {
        re.compile(r'^docker ps .*'): "openlibrary-solr-updater-1",
        re.compile(r'^docker exec .*'): DUMMY_OFFSET,
    }

    # Freeze "now" 64 seconds after the log entry so the expected lag is exact.
    entry_time = datetime.fromisoformat("2025-09-20T14:28:56.908366")
    now = entry_time.timestamp() + 64

    with (
        patch_bash_run(fake_bash),
        patch_httpx_get(fake_requests),
        patch('scripts.monitoring.solr_updater_monitor.time.time', return_value=now),
    ):
        await monitor_solr_updater(dry_run=True)

    captured = capsys.readouterr()
    assert captured.out == f'stats.ol.solr-updater.seconds_behind 64 {int(now)}\n'

scripts/monitoring/tests/test_utils_sh.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
def test_bash_run():
77
# Test without sources
88
output = bash_run("echo 'Hello, World!'", capture_output=True)
9-
assert output.stdout.strip() == "Hello, World!"
9+
assert output == "Hello, World!"
1010

1111
# Test with sources
1212
with (
@@ -23,7 +23,7 @@ def test_bash_run():
2323
sources=[source1.name, source2.name],
2424
capture_output=True,
2525
)
26-
assert output.stdout.strip() == "source1 source2"
26+
assert output == "source1 source2"
2727

2828

2929
def test_log_recent_bot_traffic():

scripts/monitoring/utils.py

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,23 @@
22
import os
33
import subprocess
44
from pathlib import Path
5+
from typing import Literal, overload
56

67
from apscheduler.schedulers.asyncio import AsyncIOScheduler
78

89

10+
@overload
11+
def bash_run(
12+
cmd: str,
13+
sources: list[str | Path] | None = None,
14+
capture_output: Literal[True] = True,
15+
) -> str: ...
16+
@overload
17+
def bash_run(
18+
cmd: str,
19+
sources: list[str | Path] | None = None,
20+
capture_output: Literal[False] = False,
21+
) -> None: ...
922
def bash_run(cmd: str, sources: list[str | Path] | None = None, capture_output=False):
1023
if not sources:
1124
sources = []
@@ -26,7 +39,7 @@ def bash_run(cmd: str, sources: list[str | Path] | None = None, capture_output=F
2639
)
2740
)
2841

29-
return subprocess.run(
42+
p = subprocess.run(
3043
[
3144
"bash",
3245
"-c",
@@ -38,6 +51,9 @@ def bash_run(cmd: str, sources: list[str | Path] | None = None, capture_output=F
3851
text=capture_output if capture_output else None,
3952
)
4053

54+
if capture_output:
55+
return p.stdout.strip()
56+
4157

4258
def limit_server(allowed_servers: list[str], scheduler: AsyncIOScheduler):
4359
"""

scripts/tests/test_obfi_sh.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ def test_obfi_match_range_multi_day(self):
2828
start = iso_to_ms(start_iso)
2929
end = iso_to_ms(end_iso)
3030
output = bash_w_obfi(f"cat {SAMPLE_LOG} | obfi_match_range {start} {end}")
31-
assert output.stdout == SAMPLE_LOG.read_text()
31+
assert output == SAMPLE_LOG.read_text().strip()
3232

3333
def test_obfi_match_range_first_hour(self):
3434
# Only lines on 2025-06-27 between 00:00:00 and 00:59:59 should match (first line only)
@@ -38,7 +38,7 @@ def test_obfi_match_range_first_hour(self):
3838
end = iso_to_ms(end_iso)
3939
output = bash_w_obfi(f"cat {SAMPLE_LOG} | obfi_match_range {start} {end}")
4040
expected = SAMPLE_LOG.read_text().splitlines()[0] + '\n'
41-
assert output.stdout == expected
41+
assert output == expected.strip()
4242

4343
def test_obfi_match_range_second_day_same_hour(self):
4444
# Only lines on 2025-06-28 between 00:00:00 and 00:59:59 should match (third line only)
@@ -48,12 +48,12 @@ def test_obfi_match_range_second_day_same_hour(self):
4848
end = iso_to_ms(end_iso)
4949
output = bash_w_obfi(f"cat {SAMPLE_LOG} | obfi_match_range {start} {end}")
5050
expected = SAMPLE_LOG.read_text().splitlines()[2] + '\n'
51-
assert output.stdout == expected
51+
assert output == expected.strip()
5252

5353
def test_obfi_match_range_no_lines(self):
5454
start_iso = '2001-09-09T01:46:40Z'
5555
end_iso = '2001-09-09T01:46:41Z'
5656
start = iso_to_ms(start_iso)
5757
end = iso_to_ms(end_iso)
5858
output = bash_w_obfi(f"cat {SAMPLE_LOG} | obfi_match_range {start} {end}")
59-
assert output.stdout == ''
59+
assert output == ''

0 commit comments

Comments
 (0)