Skip to content

Commit a29681e

Browse files
authored
fix spectator.measurements metrics (#125)
* The `spectator.measurements` internal status metrics were not being published on 200 OK responses, only when partial errors were observed. This change fixes that, so that we always publish the sent metrics counter on 200 OK. * The `spectator.measurements` metrics were using the `owner` tag to identify the source of the metrics. Our internal dashboards use the `nf.process` tag for this purpose, so we want to use that instead. * The `ipc.client.call` metrics were using the `owner` tag to identify the source of the metrics. The `owner` tag is better considered to be a way to identify a source library, when there could be many sources of a metric. It is better to use `nf.process` to identify these metrics. * The `spectator.registrySize` metrics were using the `owner` tag to identify the source of the metrics. This was swapped to `nf.process`. * We do not want to set the `nf.process` tag with an environment variable for `spectatord`, because there are many processes that send metrics through it, each of which may want to set that tag independently. Thus, we set the `nf.process` tag manually in the status metrics. * The internal configuration library version was updated, so that we default to the test endpoint for external publishing. * The command line parameter parsing was updated, so that the `--uri` parameter can be used to switch back to the prod endpoint, for external publishing, if there is a need.
1 parent 4330712 commit a29681e

File tree

8 files changed

+96
-49
lines changed

8 files changed

+96
-49
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
CMakeUserPresets.json
66
cmake-build-debug/
77
cmake-build/
8+
conan_provider.cmake
89
metatron/auth_context.pb.cc
910
metatron/auth_context.pb.h
1011
metatron/auth_context.proto

bin/spectatord_main.cc

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -111,17 +111,24 @@ auto main(int argc, char** argv) -> int {
111111

112112
auto cfg = GetSpectatorConfig();
113113

114-
auto maybe_agg_uri = absl::GetFlag(FLAGS_uri);
115-
if (absl::GetFlag(FLAGS_debug)) {
116-
cfg->uri = "https://atlas-aggr-dev.us-east-1.ieptest.netflix.net/api/v4/update";
117-
} else if (!maybe_agg_uri.empty()) {
118-
cfg->uri = std::move(maybe_agg_uri);
119-
} else if (absl::GetFlag(FLAGS_enable_external)) {
114+
if (absl::GetFlag(FLAGS_enable_external)) {
120115
cfg->external_enabled = true;
121116
cfg->metatron_dir = absl::GetFlag(FLAGS_metatron_dir);
117+
// provide a reasonable default for external publishing
122118
cfg->uri = cfg->external_uri;
123119
}
124120

121+
if (!absl::GetFlag(FLAGS_uri).empty()) {
122+
// allow overriding the uri for both standard and external publishing
123+
cfg->uri = absl::GetFlag(FLAGS_uri);
124+
}
125+
126+
if (absl::GetFlag(FLAGS_debug)) {
127+
// when running in debug mode, never publish externally
128+
cfg->external_enabled = false;
129+
cfg->uri = "https://atlas-aggr-dev.us-east-1.ieptest.netflix.net/api/v4/update";
130+
}
131+
125132
if (absl::GetFlag(FLAGS_verbose_http)) {
126133
cfg->verbose_http = true;
127134
}

conanfile.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -90,12 +90,12 @@ def get_flat_hash_map(self):
9090

9191
def get_netflix_spectator_cppconf(self, nflx_cfg: NflxConfig) -> None:
9292
repo = "corp/cldmta-netflix-spectator-cppconf"
93-
commit = "a76058e12a5e0928cc0a36acb1bfb8e045f9d589"
93+
commit = "aa12add4ba33ac08002573d1a6563cad1b620e08"
9494
zip_name = repo.replace("corp/", "") + f"-{commit}.zip"
9595

9696
self.maybe_remove_file(zip_name)
9797
self.download(nflx_cfg, repo, commit, zip_name)
98-
check_sha256(self, zip_name, "9ffc2190f0ee676ddd9af31fcdd9a05cfbd990674101d7c8958638a0a432bdef")
98+
check_sha256(self, zip_name, "7d72078e5e209ebaa1e7e861edf815c212353bcf2e8c9bf20de5304a2f9a7271")
9999

100100
dir_name = repo.replace("corp/", "")
101101
self.maybe_remove_dir(dir_name)

spectator/http_client_test.cc

Lines changed: 68 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -75,11 +75,15 @@ TEST(HttpTest, Post) {
7575

7676
auto timer_for_req = find_timer(&registry, "ipc.client.call", "200");
7777
ASSERT_TRUE(timer_for_req != nullptr);
78-
auto expected_tags =
79-
Tags{{"owner", "spectatord"}, {"http.status", "200"},
80-
{"http.method", "POST"}, {"ipc.status", "success"},
81-
{"ipc.result", "success"}, {"ipc.endpoint", "/foo"},
82-
{"ipc.attempt.final", "true"}};
78+
auto expected_tags = Tags{
79+
{"http.method", "POST"},
80+
{"http.status", "200"},
81+
{"ipc.attempt.final", "true"},
82+
{"ipc.endpoint", "/foo"},
83+
{"ipc.result", "success"},
84+
{"ipc.status", "success"},
85+
{"nf.process", "spectatord"},
86+
};
8387

8488
auto& actual_tags = timer_for_req->MeterId().GetTags();
8589
auto attempt = actual_tags.at(intern_str("ipc.attempt"));
@@ -133,11 +137,16 @@ TEST(HttpTest, PostUncompressed) {
133137

134138
auto timer_for_req = find_timer(&registry, "ipc.client.call", "200");
135139
ASSERT_TRUE(timer_for_req != nullptr);
136-
auto expected_tags =
137-
Tags{{"owner", "spectatord"}, {"http.status", "200"},
138-
{"http.method", "POST"}, {"ipc.status", "success"},
139-
{"ipc.result", "success"}, {"ipc.attempt", "initial"},
140-
{"ipc.endpoint", "/foo"}, {"ipc.attempt.final", "true"}};
140+
auto expected_tags = Tags{
141+
{"http.method", "POST"},
142+
{"http.status", "200"},
143+
{"ipc.attempt", "initial"},
144+
{"ipc.attempt.final", "true"},
145+
{"ipc.endpoint", "/foo"},
146+
{"ipc.result", "success"},
147+
{"ipc.status", "success"},
148+
{"nf.process", "spectatord"},
149+
};
141150

142151
const auto& actual_tags = timer_for_req->MeterId().GetTags();
143152
EXPECT_EQ(expected_tags, actual_tags);
@@ -181,11 +190,16 @@ TEST(HttpTest, Timeout) {
181190
auto timer_for_req = find_timer(&registry, "ipc.client.call", "-1");
182191
ASSERT_TRUE(timer_for_req != nullptr);
183192

184-
auto expected_tags =
185-
Tags{{"owner", "spectatord"}, {"http.status", "-1"},
186-
{"ipc.result", "failure"}, {"ipc.status", "timeout"},
187-
{"ipc.attempt", "initial"}, {"ipc.attempt.final", "true"},
188-
{"ipc.endpoint", "/foo"}, {"http.method", "POST"}};
193+
auto expected_tags = Tags{
194+
{"http.method", "POST"},
195+
{"http.status", "-1"},
196+
{"ipc.attempt", "initial"},
197+
{"ipc.attempt.final", "true"},
198+
{"ipc.endpoint", "/foo"},
199+
{"ipc.result", "failure"},
200+
{"ipc.status", "timeout"},
201+
{"nf.process", "spectatord"},
202+
};
189203
EXPECT_EQ(expected_tags, timer_for_req->MeterId().GetTags());
190204
}
191205

@@ -262,10 +276,16 @@ TEST(HttpTest, Get) {
262276
EXPECT_EQ(my_counters(registry).size(), 1);
263277

264278
spectator::Tags timer_tags{
265-
{"http.status", "200"}, {"ipc.attempt", "initial"},
266-
{"ipc.result", "success"}, {"owner", "spectatord"},
267-
{"ipc.status", "success"}, {"ipc.attempt.final", "true"},
268-
{"http.method", "GET"}, {"ipc.endpoint", "/get"}};
279+
{"http.method", "GET"},
280+
{"http.status", "200"},
281+
{"ipc.attempt", "initial"},
282+
{"ipc.attempt.final", "true"},
283+
{"ipc.endpoint", "/get"},
284+
{"ipc.result", "success"},
285+
{"ipc.status", "success"},
286+
{"nf.process", "spectatord"},
287+
};
288+
269289
auto timer_id = Id::Of("ipc.client.call", std::move(timer_tags));
270290
auto timer = registry.GetTimer(timer_id);
271291
EXPECT_EQ(timer->Count(), 1);
@@ -294,21 +314,30 @@ TEST(HttpTest, Get503) {
294314
EXPECT_EQ(ipc_meters.size(), 2);
295315

296316
spectator::Tags err_timer_tags{
297-
{"http.status", "503"}, {"ipc.attempt", "initial"},
298-
{"ipc.result", "failure"}, {"owner", "spectatord"},
299-
{"ipc.status", "http_error"}, {"ipc.attempt.final", "false"},
300-
{"http.method", "GET"}, {"ipc.endpoint", "/get503"}};
317+
{"http.method", "GET"},
318+
{"http.status", "503"},
319+
{"ipc.attempt", "initial"},
320+
{"ipc.attempt.final", "false"},
321+
{"ipc.endpoint", "/get503"},
322+
{"ipc.result", "failure"},
323+
{"ipc.status", "http_error"},
324+
{"nf.process", "spectatord"},
325+
};
301326
spectator::Tags success_timer_tags{
302-
{"http.status", "200"}, {"ipc.attempt", "second"},
303-
{"ipc.result", "success"}, {"owner", "spectatord"},
304-
{"ipc.status", "success"}, {"ipc.attempt.final", "true"},
305-
{"http.method", "GET"}, {"ipc.endpoint", "/get503"}};
327+
{"http.method", "GET"},
328+
{"http.status", "200"},
329+
{"ipc.attempt", "second"},
330+
{"ipc.attempt.final", "true"},
331+
{"ipc.endpoint", "/get503"},
332+
{"ipc.result", "success"},
333+
{"ipc.status", "success"},
334+
{"nf.process", "spectatord"},
335+
};
306336
auto err_id = Id::Of("ipc.client.call", std::move(err_timer_tags));
307337
auto err_timer = registry.GetTimer(err_id);
308338
EXPECT_EQ(err_timer->Count(), 1);
309339

310-
auto success_timer =
311-
registry.GetTimer("ipc.client.call", std::move(success_timer_tags));
340+
auto success_timer = registry.GetTimer("ipc.client.call", std::move(success_timer_tags));
312341
EXPECT_EQ(success_timer->Count(), 1);
313342
}
314343

@@ -327,19 +356,23 @@ void test_method_header(const std::string& method) {
327356
std::vector<std::string> headers{"X-Spectator: foo", "X-Other-Header: bar"};
328357

329358
auto url = fmt::format("http://localhost:{}/getheader", port);
330-
auto response =
331-
method == "GET" ? client.Get(url, headers) : client.Put(url, headers);
359+
auto response = method == "GET" ? client.Get(url, headers) : client.Put(url, headers);
332360

333361
server.stop();
334362
EXPECT_EQ(response.status, 200);
335363
EXPECT_EQ(response.raw_body, "x-other-header: bar\nx-spectator: foo\n");
336364
EXPECT_EQ(my_timers(registry).size(), 1);
337365

338366
spectator::Tags timer_tags{
339-
{"http.status", "200"}, {"ipc.attempt", "initial"},
340-
{"ipc.result", "success"}, {"owner", "spectatord"},
341-
{"ipc.status", "success"}, {"ipc.attempt.final", "true"},
342-
{"http.method", method}, {"ipc.endpoint", "/getheader"}};
367+
{"http.method", method},
368+
{"http.status", "200"},
369+
{"ipc.attempt", "initial"},
370+
{"ipc.attempt.final", "true"},
371+
{"ipc.endpoint", "/getheader"},
372+
{"ipc.result", "success"},
373+
{"ipc.status", "success"},
374+
{"nf.process", "spectatord"},
375+
};
343376
auto timer_id = Id::Of("ipc.client.call", std::move(timer_tags));
344377
auto timer = registry.GetTimer(timer_id);
345378
EXPECT_EQ(timer->Count(), 1);

spectator/log_entry.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ class LogEntry {
1212
const std::string& url)
1313
: registry_{registry},
1414
start_{absl::Now()},
15-
id_{Id::Of("ipc.client.call", {{"owner", "spectatord"},
15+
id_{Id::Of("ipc.client.call", {{"nf.process", "spectatord"},
1616
{"ipc.endpoint", PathFromUrl(url)},
1717
{"http.method", method},
1818
{"http.status", "-1"}})} {}

spectator/publisher.h

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ namespace detail {
2828
template <typename R>
2929
auto get_counter(R* registry, Tags tags) -> std::shared_ptr<Counter> {
3030
static constexpr auto kSpectatorMeasurements = "spectator.measurements";
31-
tags.add("owner", "spectatord");
31+
tags.add("nf.process", "spectatord");
3232
return registry->GetCounter(kSpectatorMeasurements, std::move(tags));
3333
}
3434
} // namespace detail
@@ -41,7 +41,8 @@ class Publisher {
4141
started_{false},
4242
should_stop_{false},
4343
last_successful_send_{absl::GetCurrentTimeNanos()},
44-
sentMetrics_{detail::get_counter(registry, Tags{{"id", "sent"}})},
44+
sentMetrics_{detail::get_counter(
45+
registry, Tags{{"id", "sent"}})},
4546
invalidMetrics_{detail::get_counter(
4647
registry, Tags{{"id", "dropped"}, {"error", "validation"}})},
4748
droppedHttp_{detail::get_counter(
@@ -279,8 +280,12 @@ class Publisher {
279280
const auto& uri = registry_->GetConfig().uri;
280281
const auto& status_metrics_enabled = registry_->GetConfig().status_metrics_enabled;
281282
auto http_code = http_response.status;
283+
282284
if (http_code == 200) {
283285
num_sent = num_measurements;
286+
if (status_metrics_enabled) {
287+
sentMetrics_->Add(static_cast<double>(num_sent));
288+
}
284289
} else if (http_code > 200 && http_code < 500) {
285290
rapidjson::Document body;
286291
body.Parse(http_response.raw_body.c_str(), http_response.raw_body.size());
@@ -298,7 +303,7 @@ class Publisher {
298303
num_sent = num_measurements - err_count;
299304
if (status_metrics_enabled) {
300305
invalidMetrics_->Add(err_count);
301-
sentMetrics_->Add(static_cast<double>(num_measurements - err_count));
306+
sentMetrics_->Add(static_cast<double>(num_sent));
302307
}
303308
auto messages = body["message"].GetArray();
304309
for (auto& msg : messages) {

spectator/registry.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ Registry::Registry(std::unique_ptr<Config> config,
1212
meter_ttl_{absl::ToInt64Nanoseconds(config->meter_ttl)},
1313
config_{std::move(config)},
1414
logger_{std::move(logger)},
15-
registry_size_{GetDistributionSummary("spectator.registrySize", Tags{{"owner", "spectatord"}})},
15+
registry_size_{GetDistributionSummary("spectator.registrySize", Tags{{"nf.process", "spectatord"}})},
1616
publisher_(this) {
1717
if (meter_ttl_ == 0) {
1818
meter_ttl_ = int64_t(15) * 60 * 1000 * 1000 * 1000;

spectator/registry_test.cc

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -184,7 +184,8 @@ TEST(Registry, DistSummary_Size) {
184184
r.GetCounter("bar")->Increment();
185185
// we have 4 measurements from the timer + 1 from the counter
186186
r.Measurements();
187-
EXPECT_DOUBLE_EQ(r.GetDistributionSummary("spectator.registrySize", Tags{{"owner", "spectatord"}})->TotalAmount(), 5.0);
187+
EXPECT_DOUBLE_EQ(r.GetDistributionSummary("spectator.registrySize",
188+
Tags{{"nf.process", "spectatord"}})->TotalAmount(), 5.0);
188189
}
189190

190191
TEST(Registry, DeleteMeters) {

0 commit comments

Comments
 (0)