Skip to content

Commit 8bd2b9e

Browse files
romangechakaz
andauthored
chore: optimize info command (#4137)
* chore: optimize info command Info command has a large latency when returning all the sections. But often a single section is required. Specifically, SERVER and REPLICATION sections are often fetched by clients or management components. This PR: 1. Removes any hops for "INFO SERVER" command. 2. Removes some redundant stats. 3. Prints latency stats around GetMetrics command if it took to much. Signed-off-by: Roman Gershman <[email protected]> * Update src/server/server_family.cc Co-authored-by: Shahar Mike <[email protected]> Signed-off-by: Roman Gershman <[email protected]> * chore: remove GetMetrics dependency from the REPLICATION section Also, address comments Signed-off-by: Roman Gershman <[email protected]> * fix: clang build --------- Signed-off-by: Roman Gershman <[email protected]> Signed-off-by: Roman Gershman <[email protected]> Co-authored-by: Shahar Mike <[email protected]>
1 parent 8e3b8cc commit 8bd2b9e

File tree

6 files changed

+69
-62
lines changed

6 files changed

+69
-62
lines changed

src/facade/dragonfly_connection.cc

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -630,9 +630,6 @@ void Connection::OnPostMigrateThread() {
630630
stats_ = &tl_facade_stats->conn_stats;
631631
++stats_->num_conns;
632632
stats_->read_buf_capacity += io_buf_.Capacity();
633-
if (cc_->replica_conn) {
634-
++stats_->num_replicas;
635-
}
636633
}
637634

638635
void Connection::OnConnectionStart() {
@@ -1840,10 +1837,6 @@ Connection::MemoryUsage Connection::GetMemoryUsage() const {
18401837
void Connection::DecreaseStatsOnClose() {
18411838
stats_->read_buf_capacity -= io_buf_.Capacity();
18421839

1843-
// Update num_replicas if this was a replica connection.
1844-
if (cc_->replica_conn) {
1845-
--stats_->num_replicas;
1846-
}
18471840
--stats_->num_conns;
18481841
}
18491842

src/facade/facade.cc

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ constexpr size_t kSizeConnStats = sizeof(ConnectionStats);
2020

2121
ConnectionStats& ConnectionStats::operator+=(const ConnectionStats& o) {
2222
// To break this code deliberately if we add/remove a field to this struct.
23-
static_assert(kSizeConnStats == 120u);
23+
static_assert(kSizeConnStats == 112u);
2424

2525
ADD(read_buf_capacity);
2626
ADD(dispatch_queue_entries);
@@ -34,7 +34,6 @@ ConnectionStats& ConnectionStats::operator+=(const ConnectionStats& o) {
3434
ADD(pipelined_cmd_latency);
3535
ADD(conn_received_cnt);
3636
ADD(num_conns);
37-
ADD(num_replicas);
3837
ADD(num_blocked_clients);
3938
ADD(num_migrations);
4039
ADD(pipeline_throttle_count);

src/facade/facade_types.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,6 @@ struct ConnectionStats {
105105
uint64_t conn_received_cnt = 0;
106106

107107
uint32_t num_conns = 0;
108-
uint32_t num_replicas = 0;
109108
uint32_t num_blocked_clients = 0;
110109
uint64_t num_migrations = 0;
111110

src/server/common.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ using RdbTypeFreqMap = absl::flat_hash_map<unsigned, size_t>;
4747
class CommandId;
4848
class Transaction;
4949
class EngineShard;
50-
class ConnectionState;
50+
struct ConnectionState;
5151
class Interpreter;
5252
class Namespaces;
5353

src/server/server_family.cc

Lines changed: 67 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -1275,15 +1275,17 @@ void AppendMetricWithoutLabels(string_view name, string_view help, const absl::A
12751275
AppendMetricValue(name, value, {}, {}, dest);
12761276
}
12771277

1278-
void PrintPrometheusMetrics(const Metrics& m, DflyCmd* dfly_cmd, StringResponse* resp) {
1278+
void PrintPrometheusMetrics(uint64_t uptime, const Metrics& m, DflyCmd* dfly_cmd,
1279+
StringResponse* resp) {
12791280
// Server metrics
12801281
AppendMetricHeader("version", "", MetricType::GAUGE, &resp->body());
12811282
AppendMetricValue("version", 1, {"version"}, {GetVersion()}, &resp->body());
12821283

12831284
bool is_master = ServerState::tlocal()->is_master;
1285+
12841286
AppendMetricWithoutLabels("master", "1 if master 0 if replica", is_master ? 1 : 0,
12851287
MetricType::GAUGE, &resp->body());
1286-
AppendMetricWithoutLabels("uptime_in_seconds", "", m.uptime, MetricType::COUNTER, &resp->body());
1288+
AppendMetricWithoutLabels("uptime_in_seconds", "", uptime, MetricType::COUNTER, &resp->body());
12871289

12881290
// Clients metrics
12891291
const auto& conn_stats = m.facade_stats.conn_stats;
@@ -1377,15 +1379,6 @@ void PrintPrometheusMetrics(const Metrics& m, DflyCmd* dfly_cmd, StringResponse*
13771379
if (added)
13781380
absl::StrAppend(&resp->body(), type_used_memory_metric);
13791381
}
1380-
if (!m.master_side_replicas_info.empty()) {
1381-
ReplicationMemoryStats repl_mem;
1382-
dfly_cmd->GetReplicationMemoryStats(&repl_mem);
1383-
AppendMetricWithoutLabels(
1384-
"replication_streaming_bytes", "Stable sync replication memory usage",
1385-
repl_mem.streamer_buf_capacity_bytes, MetricType::GAUGE, &resp->body());
1386-
AppendMetricWithoutLabels("replication_full_sync_bytes", "Full sync memory usage",
1387-
repl_mem.full_sync_buf_bytes, MetricType::GAUGE, &resp->body());
1388-
}
13891382

13901383
// Stats metrics
13911384
AppendMetricWithoutLabels("connections_received_total", "", conn_stats.conn_received_cnt,
@@ -1458,11 +1451,25 @@ void PrintPrometheusMetrics(const Metrics& m, DflyCmd* dfly_cmd, StringResponse*
14581451
absl::StrAppend(&resp->body(), command_metrics);
14591452
}
14601453

1461-
if (!m.master_side_replicas_info.empty()) {
1454+
if (m.replica_side_info) { // slave side
1455+
auto& replica_info = *m.replica_side_info;
1456+
AppendMetricWithoutLabels("replica_reconnect_count", "Number of replica reconnects",
1457+
replica_info.reconnect_count, MetricType::COUNTER, &resp->body());
1458+
} else { // Master side
14621459
string replication_lag_metrics;
1460+
vector<ReplicaRoleInfo> replicas_info = dfly_cmd->GetReplicasRoleInfo();
1461+
1462+
ReplicationMemoryStats repl_mem;
1463+
dfly_cmd->GetReplicationMemoryStats(&repl_mem);
1464+
AppendMetricWithoutLabels(
1465+
"replication_streaming_bytes", "Stable sync replication memory usage",
1466+
repl_mem.streamer_buf_capacity_bytes, MetricType::GAUGE, &resp->body());
1467+
AppendMetricWithoutLabels("replication_full_sync_bytes", "Full sync memory usage",
1468+
repl_mem.full_sync_buf_bytes, MetricType::GAUGE, &resp->body());
1469+
14631470
AppendMetricHeader("connected_replica_lag_records", "Lag in records of a connected replica.",
14641471
MetricType::GAUGE, &replication_lag_metrics);
1465-
for (const auto& replica : m.master_side_replicas_info) {
1472+
for (const auto& replica : replicas_info) {
14661473
AppendMetricValue("connected_replica_lag_records", replica.lsn_lag,
14671474
{"replica_ip", "replica_port", "replica_state"},
14681475
{replica.address, absl::StrCat(replica.listening_port), replica.state},
@@ -1471,12 +1478,6 @@ void PrintPrometheusMetrics(const Metrics& m, DflyCmd* dfly_cmd, StringResponse*
14711478
absl::StrAppend(&resp->body(), replication_lag_metrics);
14721479
}
14731480

1474-
if (m.replica_side_info) {
1475-
auto& replica_info = *m.replica_side_info;
1476-
AppendMetricWithoutLabels("replica_reconnect_count", "Number of replica reconnects",
1477-
replica_info.reconnect_count, MetricType::COUNTER, &resp->body());
1478-
}
1479-
14801481
AppendMetricWithoutLabels("fiber_switch_total", "", m.fiber_switch_cnt, MetricType::COUNTER,
14811482
&resp->body());
14821483
double delay_seconds = m.fiber_switch_delay_usec * 1e-6;
@@ -1533,7 +1534,8 @@ void ServerFamily::ConfigureMetrics(util::HttpListenerBase* http_base) {
15331534

15341535
auto cb = [this](const util::http::QueryArgs& args, util::HttpContext* send) {
15351536
StringResponse resp = util::http::MakeStringResponse(boost::beast::http::status::ok);
1536-
PrintPrometheusMetrics(this->GetMetrics(&namespaces->GetDefaultNamespace()),
1537+
uint64_t uptime = time(NULL) - start_time_;
1538+
PrintPrometheusMetrics(uptime, this->GetMetrics(&namespaces->GetDefaultNamespace()),
15371539
this->dfly_cmd_.get(), &resp);
15381540

15391541
return send->Invoke(std::move(resp));
@@ -1607,14 +1609,17 @@ void ServerFamily::StatsMC(std::string_view section, SinkReplyBuilder* builder)
16071609

16081610
double utime = dbl_time(ru.ru_utime);
16091611
double systime = dbl_time(ru.ru_stime);
1612+
auto kind = ProactorBase::me()->GetKind();
1613+
const char* multiplex_api = (kind == ProactorBase::IOURING) ? "iouring" : "epoll";
16101614

16111615
Metrics m = GetMetrics(&namespaces->GetDefaultNamespace());
1616+
uint64_t uptime = time(NULL) - start_time_;
16121617

16131618
ADD_LINE(pid, getpid());
1614-
ADD_LINE(uptime, m.uptime);
1619+
ADD_LINE(uptime, uptime);
16151620
ADD_LINE(time, now);
16161621
ADD_LINE(version, kGitTag);
1617-
ADD_LINE(libevent, "iouring");
1622+
ADD_LINE(libevent, multiplex_api);
16181623
ADD_LINE(pointer_size, sizeof(void*));
16191624
ADD_LINE(rusage_user, utime);
16201625
ADD_LINE(rusage_system, systime);
@@ -2083,7 +2088,7 @@ static void MergeDbSliceStats(const DbSlice::Stats& src, Metrics* dest) {
20832088

20842089
void ServerFamily::ResetStat(Namespace* ns) {
20852090
shard_set->pool()->AwaitBrief(
2086-
[registry = service_.mutable_registry(), this, ns](unsigned index, auto*) {
2091+
[registry = service_.mutable_registry(), ns](unsigned index, auto*) {
20872092
registry->ResetCallStats(index);
20882093
ns->GetCurrentDbSlice().ResetEvents();
20892094
facade::ResetStats();
@@ -2095,6 +2100,8 @@ Metrics ServerFamily::GetMetrics(Namespace* ns) const {
20952100
Metrics result;
20962101
util::fb2::Mutex mu;
20972102

2103+
uint64_t start = absl::GetCurrentTimeNanos();
2104+
20982105
auto cmd_stat_cb = [&dest = result.cmd_stats_map](string_view name, const CmdCallStats& stat) {
20992106
auto& [calls, sum] = dest[absl::AsciiStrToLower(name)];
21002107
calls += stat.first;
@@ -2117,7 +2124,6 @@ Metrics ServerFamily::GetMetrics(Namespace* ns) const {
21172124

21182125
result.coordinator_stats.Add(ss->stats);
21192126

2120-
result.uptime = time(NULL) - this->start_time_;
21212127
result.qps += uint64_t(ss->MovingSum6());
21222128
result.facade_stats += *tl_facade_stats;
21232129
result.serialization_bytes += SliceSnapshot::GetThreadLocalMemoryUsage();
@@ -2156,15 +2162,16 @@ Metrics ServerFamily::GetMetrics(Namespace* ns) const {
21562162

21572163
service_.proactor_pool().AwaitFiberOnAll(std::move(cb));
21582164

2165+
uint64_t after_cb = absl::GetCurrentTimeNanos();
2166+
21592167
// Normalize moving average stats
21602168
result.qps /= 6;
21612169
result.traverse_ttl_per_sec /= 6;
21622170
result.delete_ttl_per_sec /= 6;
21632171

21642172
bool is_master = ServerState::tlocal() && ServerState::tlocal()->is_master;
2165-
if (is_master) {
2166-
result.master_side_replicas_info = dfly_cmd_->GetReplicasRoleInfo();
2167-
} else {
2173+
2174+
if (!is_master) {
21682175
auto info = GetReplicaSummary();
21692176
if (info) {
21702177
result.replica_side_info = {
@@ -2187,6 +2194,12 @@ Metrics ServerFamily::GetMetrics(Namespace* ns) const {
21872194

21882195
result.peak_stats = peak_stats_;
21892196

2197+
uint64_t delta_ms = (absl::GetCurrentTimeNanos() - start) / 1'000'000;
2198+
if (delta_ms > 30) {
2199+
uint64_t cb_dur = (after_cb - start) / 1'000'000;
2200+
LOG(INFO) << "GetMetrics took " << delta_ms << " ms, out of which callback took " << cb_dur
2201+
<< " ms";
2202+
}
21902203
return result;
21912204
}
21922205

@@ -2217,15 +2230,6 @@ void ServerFamily::Info(CmdArgList args, Transaction* tx, SinkReplyBuilder* buil
22172230
absl::StrAppend(&info, a1, ":", a2, "\r\n");
22182231
};
22192232

2220-
uint64_t start = absl::GetCurrentTimeNanos();
2221-
Metrics m = GetMetrics(cntx->ns);
2222-
uint64_t delta_ms = (absl::GetCurrentTimeNanos() - start) / 1000'000;
2223-
LOG_IF(INFO, delta_ms > 100) << "GetMetrics took " << delta_ms << " ms";
2224-
2225-
DbStats total;
2226-
for (const auto& db_stats : m.db_stats)
2227-
total += db_stats;
2228-
22292233
if (should_enter("SERVER")) {
22302234
auto kind = ProactorBase::me()->GetKind();
22312235
const char* multiplex_api = (kind == ProactorBase::IOURING) ? "iouring" : "epoll";
@@ -2238,11 +2242,22 @@ void ServerFamily::Info(CmdArgList args, Transaction* tx, SinkReplyBuilder* buil
22382242
append("multiplexing_api", multiplex_api);
22392243
append("tcp_port", GetFlag(FLAGS_port));
22402244
append("thread_count", service_.proactor_pool().size());
2241-
size_t uptime = m.uptime;
2245+
2246+
uint64_t uptime = time(NULL) - start_time_;
22422247
append("uptime_in_seconds", uptime);
22432248
append("uptime_in_days", uptime / (3600 * 24));
22442249
}
22452250

2251+
Metrics m;
2252+
// Save time by not calculating metrics if we don't need them.
2253+
if (!(section == "SERVER" || section == "REPLICATION")) {
2254+
m = GetMetrics(cntx->ns);
2255+
}
2256+
2257+
DbStats total;
2258+
for (const auto& db_stats : m.db_stats)
2259+
total += db_stats;
2260+
22462261
if (should_enter("CLIENTS")) {
22472262
append("connected_clients", m.facade_stats.conn_stats.num_conns);
22482263
append("max_clients", GetFlag(FLAGS_maxclients));
@@ -2310,7 +2325,7 @@ void ServerFamily::Info(CmdArgList args, Transaction* tx, SinkReplyBuilder* buil
23102325
append("maxmemory_policy", "noeviction");
23112326
}
23122327

2313-
if (!m.master_side_replicas_info.empty()) {
2328+
if (!m.replica_side_info) { // master
23142329
ReplicationMemoryStats repl_mem;
23152330
dfly_cmd_->GetReplicationMemoryStats(&repl_mem);
23162331
append("replication_streaming_buffer_bytes", repl_mem.streamer_buf_capacity_bytes);
@@ -2472,17 +2487,23 @@ void ServerFamily::Info(CmdArgList args, Transaction* tx, SinkReplyBuilder* buil
24722487
}
24732488

24742489
if (should_enter("REPLICATION")) {
2475-
util::fb2::LockGuard lk(replicaof_mu_);
2490+
bool is_master = true;
24762491
// Thread local var is_master is updated under mutex replicaof_mu_ together with replica_,
24772492
// ensuring eventual consistency of is_master. When determining if the server is a replica and
24782493
// accessing the replica_ object, we must lock replicaof_mu_. Using is_master alone is
24792494
// insufficient in this scenario.
2480-
if (!replica_) {
2495+
// Please note that we we do not use Metrics object here.
2496+
{
2497+
fb2::LockGuard lk(replicaof_mu_);
2498+
is_master = !replica_;
2499+
}
2500+
if (is_master) {
2501+
vector<ReplicaRoleInfo> replicas_info = dfly_cmd_->GetReplicasRoleInfo();
24812502
append("role", "master");
2482-
append("connected_slaves", m.facade_stats.conn_stats.num_replicas);
2483-
const auto& replicas = m.master_side_replicas_info;
2484-
for (size_t i = 0; i < replicas.size(); i++) {
2485-
auto& r = replicas[i];
2503+
append("connected_slaves", replicas_info.size());
2504+
2505+
for (size_t i = 0; i < replicas_info.size(); i++) {
2506+
auto& r = replicas_info[i];
24862507
// e.g. slave0:ip=172.19.0.3,port=6379,state=full_sync
24872508
append(StrCat("slave", i), StrCat("ip=", r.address, ",port=", r.listening_port,
24882509
",state=", r.state, ",lag=", r.lsn_lag));
@@ -2505,6 +2526,8 @@ void ServerFamily::Info(CmdArgList args, Transaction* tx, SinkReplyBuilder* buil
25052526
append("slave_priority", GetFlag(FLAGS_replica_priority));
25062527
append("slave_read_only", 1);
25072528
};
2529+
fb2::LockGuard lk(replicaof_mu_);
2530+
25082531
replication_info_cb(replica_->GetSummary());
25092532

25102533
// Special case, when multiple masters replicate to a single replica.
@@ -2895,9 +2918,6 @@ void ServerFamily::ReplConf(CmdArgList args, Transaction* tx, SinkReplyBuilder*
28952918
string sync_id = absl::StrCat("SYNC", sid);
28962919
cntx->conn_state.replication_info.repl_session_id = sid;
28972920

2898-
if (!cntx->replica_conn) {
2899-
ServerState::tl_connection_stats()->num_replicas += 1;
2900-
}
29012921
cntx->replica_conn = true;
29022922

29032923
// The response for 'capa dragonfly' is: <masterid> <syncid> <numthreads> <version>

src/server/server_family.h

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,6 @@ struct Metrics {
8989
ServerState::Stats coordinator_stats; // stats on transaction running
9090
PeakStats peak_stats;
9191

92-
size_t uptime = 0;
9392
size_t qps = 0;
9493

9594
size_t heap_used_bytes = 0;
@@ -119,9 +118,6 @@ struct Metrics {
119118

120119
absl::flat_hash_map<std::string, uint64_t> connections_lib_name_ver_map;
121120

122-
// Replica info on the master side.
123-
std::vector<ReplicaRoleInfo> master_side_replicas_info;
124-
125121
struct ReplicaInfo {
126122
uint32_t reconnect_count;
127123
};

0 commit comments

Comments
 (0)