Skip to content
This repository was archived by the owner on Apr 26, 2024. It is now read-only.

Commit cde8af9

Browse files
authored
Add config flags to allow for cache auto-tuning (#12701)
1 parent e8ae472 commit cde8af9

File tree

7 files changed

+266
-54
lines changed

7 files changed

+266
-54
lines changed

changelog.d/12701.feature

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Add a config options to allow for auto-tuning of caches.

docs/sample_config.yaml

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -784,6 +784,24 @@ caches:
784784
#
785785
#cache_entry_ttl: 30m
786786

787+
# This flag enables cache autotuning, and is further specified by the sub-options `max_cache_memory_usage`,
788+
# `target_cache_memory_usage`, `min_cache_ttl`. These flags work in conjunction with each other to maintain
789+
# a balance between cache memory usage and cache entry availability. You must be using jemalloc to utilize
790+
# this option, and all three of the options must be specified for this feature to work.
791+
#cache_autotuning:
792+
# This flag sets a ceiling on much memory the cache can use before caches begin to be continuously evicted.
793+
# They will continue to be evicted until the memory usage drops below the `target_memory_usage`, set in
794+
# the flag below, or until the `min_cache_ttl` is hit.
795+
#max_cache_memory_usage: 1024M
796+
797+
# This flag sets a rough target for the desired memory usage of the caches.
798+
#target_cache_memory_usage: 758M
799+
800+
# 'min_cache_ttl` sets a limit under which newer cache entries are not evicted and is only applied when
801+
# caches are actively being evicted/`max_cache_memory_usage` has been exceeded. This is to protect hot caches
802+
# from being emptied while Synapse is evicting due to memory.
803+
#min_cache_ttl: 5m
804+
787805
# Controls how long the results of a /sync request are cached for after
788806
# a successful response is returned. A higher duration can help clients with
789807
# intermittent connections, at the cost of higher memory usage.

docs/usage/configuration/config_documentation.md

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1119,16 +1119,29 @@ Caching can be configured through the following sub-options:
11191119
with intermittent connections, at the cost of higher memory usage.
11201120
By default, this is zero, which means that sync responses are not cached
11211121
at all.
1122-
1122+
* `cache_autotuning` and its sub-options `max_cache_memory_usage`, `target_cache_memory_usage`, and
1123+
`min_cache_ttl` work in conjunction with each other to maintain a balance between cache memory
1124+
usage and cache entry availability. You must be using [jemalloc](https://github.com/matrix-org/synapse#help-synapse-is-slow-and-eats-all-my-ramcpu)
1125+
to utilize this option, and all three of the options must be specified for this feature to work.
1126+
* `max_cache_memory_usage` sets a ceiling on how much memory the cache can use before caches begin to be continuously evicted.
1127+
They will continue to be evicted until the memory usage drops below the `target_memory_usage`, set in
1128+
the flag below, or until the `min_cache_ttl` is hit.
1129+
* `target_memory_usage` sets a rough target for the desired memory usage of the caches.
1130+
* `min_cache_ttl` sets a limit under which newer cache entries are not evicted and is only applied when
1131+
caches are actively being evicted/`max_cache_memory_usage` has been exceeded. This is to protect hot caches
1132+
from being emptied while Synapse is evicting due to memory.
11231133

11241134
Example configuration:
11251135
```yaml
11261136
caches:
11271137
global_factor: 1.0
11281138
per_cache_factors:
11291139
get_users_who_share_room_with_user: 2.0
1130-
expire_caches: false
11311140
sync_response_cache_duration: 2m
1141+
cache_autotuning:
1142+
max_cache_memory_usage: 1024M
1143+
target_cache_memory_usage: 758M
1144+
min_cache_ttl: 5m
11321145
```
11331146

11341147
### Reloading cache factors

synapse/config/cache.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -176,6 +176,24 @@ def generate_config_section(self, **kwargs: Any) -> str:
176176
#
177177
#cache_entry_ttl: 30m
178178
179+
# This flag enables cache autotuning, and is further specified by the sub-options `max_cache_memory_usage`,
180+
# `target_cache_memory_usage`, `min_cache_ttl`. These flags work in conjunction with each other to maintain
181+
# a balance between cache memory usage and cache entry availability. You must be using jemalloc to utilize
182+
# this option, and all three of the options must be specified for this feature to work.
183+
#cache_autotuning:
184+
# This flag sets a ceiling on much memory the cache can use before caches begin to be continuously evicted.
185+
# They will continue to be evicted until the memory usage drops below the `target_memory_usage`, set in
186+
# the flag below, or until the `min_cache_ttl` is hit.
187+
#max_cache_memory_usage: 1024M
188+
189+
# This flag sets a rough target for the desired memory usage of the caches.
190+
#target_cache_memory_usage: 758M
191+
192+
# 'min_cache_ttl` sets a limit under which newer cache entries are not evicted and is only applied when
193+
# caches are actively being evicted/`max_cache_memory_usage` has been exceeded. This is to protect hot caches
194+
# from being emptied while Synapse is evicting due to memory.
195+
#min_cache_ttl: 5m
196+
179197
# Controls how long the results of a /sync request are cached for after
180198
# a successful response is returned. A higher duration can help clients with
181199
# intermittent connections, at the cost of higher memory usage.
@@ -263,6 +281,21 @@ def read_config(self, config: JsonDict, **kwargs: Any) -> None:
263281
)
264282
self.expiry_time_msec = self.parse_duration(expiry_time)
265283

284+
self.cache_autotuning = cache_config.get("cache_autotuning")
285+
if self.cache_autotuning:
286+
max_memory_usage = self.cache_autotuning.get("max_cache_memory_usage")
287+
self.cache_autotuning["max_cache_memory_usage"] = self.parse_size(
288+
max_memory_usage
289+
)
290+
291+
target_mem_size = self.cache_autotuning.get("target_cache_memory_usage")
292+
self.cache_autotuning["target_cache_memory_usage"] = self.parse_size(
293+
target_mem_size
294+
)
295+
296+
min_cache_ttl = self.cache_autotuning.get("min_cache_ttl")
297+
self.cache_autotuning["min_cache_ttl"] = self.parse_duration(min_cache_ttl)
298+
266299
self.sync_response_cache_duration = self.parse_duration(
267300
cache_config.get("sync_response_cache_duration", 0)
268301
)

synapse/metrics/jemalloc.py

Lines changed: 73 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
import re
1919
from typing import Iterable, Optional, overload
2020

21+
import attr
2122
from prometheus_client import REGISTRY, Metric
2223
from typing_extensions import Literal
2324

@@ -27,52 +28,24 @@
2728
logger = logging.getLogger(__name__)
2829

2930

30-
def _setup_jemalloc_stats() -> None:
31-
"""Checks to see if jemalloc is loaded, and hooks up a collector to record
32-
statistics exposed by jemalloc.
33-
"""
34-
35-
# Try to find the loaded jemalloc shared library, if any. We need to
36-
# introspect into what is loaded, rather than loading whatever is on the
37-
# path, as if we load a *different* jemalloc version things will seg fault.
38-
39-
# We look in `/proc/self/maps`, which only exists on linux.
40-
if not os.path.exists("/proc/self/maps"):
41-
logger.debug("Not looking for jemalloc as no /proc/self/maps exist")
42-
return
43-
44-
# We're looking for a path at the end of the line that includes
45-
# "libjemalloc".
46-
regex = re.compile(r"/\S+/libjemalloc.*$")
47-
48-
jemalloc_path = None
49-
with open("/proc/self/maps") as f:
50-
for line in f:
51-
match = regex.search(line.strip())
52-
if match:
53-
jemalloc_path = match.group()
54-
55-
if not jemalloc_path:
56-
# No loaded jemalloc was found.
57-
logger.debug("jemalloc not found")
58-
return
59-
60-
logger.debug("Found jemalloc at %s", jemalloc_path)
61-
62-
jemalloc = ctypes.CDLL(jemalloc_path)
31+
@attr.s(slots=True, frozen=True, auto_attribs=True)
32+
class JemallocStats:
33+
jemalloc: ctypes.CDLL
6334

6435
@overload
6536
def _mallctl(
66-
name: str, read: Literal[True] = True, write: Optional[int] = None
37+
self, name: str, read: Literal[True] = True, write: Optional[int] = None
6738
) -> int:
6839
...
6940

7041
@overload
71-
def _mallctl(name: str, read: Literal[False], write: Optional[int] = None) -> None:
42+
def _mallctl(
43+
self, name: str, read: Literal[False], write: Optional[int] = None
44+
) -> None:
7245
...
7346

7447
def _mallctl(
75-
name: str, read: bool = True, write: Optional[int] = None
48+
self, name: str, read: bool = True, write: Optional[int] = None
7649
) -> Optional[int]:
7750
"""Wrapper around `mallctl` for reading and writing integers to
7851
jemalloc.
@@ -120,7 +93,7 @@ def _mallctl(
12093
# Where oldp/oldlenp is a buffer where the old value will be written to
12194
# (if not null), and newp/newlen is the buffer with the new value to set
12295
# (if not null). Note that they're all references *except* newlen.
123-
result = jemalloc.mallctl(
96+
result = self.jemalloc.mallctl(
12497
name.encode("ascii"),
12598
input_var_ref,
12699
input_len_ref,
@@ -136,21 +109,80 @@ def _mallctl(
136109

137110
return input_var.value
138111

139-
def _jemalloc_refresh_stats() -> None:
112+
def refresh_stats(self) -> None:
140113
"""Request that jemalloc updates its internal statistics. This needs to
141114
be called before querying for stats, otherwise it will return stale
142115
values.
143116
"""
144117
try:
145-
_mallctl("epoch", read=False, write=1)
118+
self._mallctl("epoch", read=False, write=1)
146119
except Exception as e:
147120
logger.warning("Failed to reload jemalloc stats: %s", e)
148121

122+
def get_stat(self, name: str) -> int:
123+
"""Request the stat of the given name at the time of the last
124+
`refresh_stats` call. This may throw if we fail to read
125+
the stat.
126+
"""
127+
return self._mallctl(f"stats.{name}")
128+
129+
130+
_JEMALLOC_STATS: Optional[JemallocStats] = None
131+
132+
133+
def get_jemalloc_stats() -> Optional[JemallocStats]:
134+
"""Returns an interface to jemalloc, if it is being used.
135+
136+
Note that this will always return None until `setup_jemalloc_stats` has been
137+
called.
138+
"""
139+
return _JEMALLOC_STATS
140+
141+
142+
def _setup_jemalloc_stats() -> None:
143+
"""Checks to see if jemalloc is loaded, and hooks up a collector to record
144+
statistics exposed by jemalloc.
145+
"""
146+
147+
global _JEMALLOC_STATS
148+
149+
# Try to find the loaded jemalloc shared library, if any. We need to
150+
# introspect into what is loaded, rather than loading whatever is on the
151+
# path, as if we load a *different* jemalloc version things will seg fault.
152+
153+
# We look in `/proc/self/maps`, which only exists on linux.
154+
if not os.path.exists("/proc/self/maps"):
155+
logger.debug("Not looking for jemalloc as no /proc/self/maps exist")
156+
return
157+
158+
# We're looking for a path at the end of the line that includes
159+
# "libjemalloc".
160+
regex = re.compile(r"/\S+/libjemalloc.*$")
161+
162+
jemalloc_path = None
163+
with open("/proc/self/maps") as f:
164+
for line in f:
165+
match = regex.search(line.strip())
166+
if match:
167+
jemalloc_path = match.group()
168+
169+
if not jemalloc_path:
170+
# No loaded jemalloc was found.
171+
logger.debug("jemalloc not found")
172+
return
173+
174+
logger.debug("Found jemalloc at %s", jemalloc_path)
175+
176+
jemalloc_dll = ctypes.CDLL(jemalloc_path)
177+
178+
stats = JemallocStats(jemalloc_dll)
179+
_JEMALLOC_STATS = stats
180+
149181
class JemallocCollector(Collector):
150182
"""Metrics for internal jemalloc stats."""
151183

152184
def collect(self) -> Iterable[Metric]:
153-
_jemalloc_refresh_stats()
185+
stats.refresh_stats()
154186

155187
g = GaugeMetricFamily(
156188
"jemalloc_stats_app_memory_bytes",
@@ -184,7 +216,7 @@ def collect(self) -> Iterable[Metric]:
184216
"metadata",
185217
):
186218
try:
187-
value = _mallctl(f"stats.{t}")
219+
value = stats.get_stat(t)
188220
except Exception as e:
189221
# There was an error fetching the value, skip.
190222
logger.warning("Failed to read jemalloc stats.%s: %s", t, e)

0 commit comments

Comments
 (0)