-
Notifications
You must be signed in to change notification settings - Fork 25.6k
Add more metrics for reindex #137597
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Add more metrics for reindex #137597
Changes from all commits
f11a9c7
290fa48
6566a55
70ed46d
978f34c
84afac9
26abb9d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -9,25 +9,74 @@ | |
|
|
||
| package org.elasticsearch.reindex; | ||
|
|
||
| import org.elasticsearch.ElasticsearchStatusException; | ||
| import org.elasticsearch.telemetry.metric.LongHistogram; | ||
| import org.elasticsearch.telemetry.metric.MeterRegistry; | ||
|
|
||
| import java.util.HashMap; | ||
| import java.util.Map; | ||
|
|
||
| public class ReindexMetrics { | ||
|
|
||
| public static final String REINDEX_TIME_HISTOGRAM = "es.reindex.duration.histogram"; | ||
| public static final String REINDEX_COMPLETION_HISTOGRAM = "es.reindex.completion.histogram"; | ||
|
|
||
| // refers to https://opentelemetry.io/docs/specs/semconv/registry/attributes/error/#error-type | ||
| public static final String ATTRIBUTE_NAME_ERROR_TYPE = "error.type"; | ||
|
|
||
| public static final String ATTRIBUTE_NAME_SOURCE = "reindex.source"; | ||
| public static final String ATTRIBUTE_VALUE_SOURCE_LOCAL = "local"; | ||
| public static final String ATTRIBUTE_VALUE_SOURCE_REMOTE = "remote"; | ||
|
|
||
| private final LongHistogram reindexTimeSecsHistogram; | ||
| private final LongHistogram reindexCompletionHistogram; | ||
|
|
||
| public ReindexMetrics(MeterRegistry meterRegistry) { | ||
| this(meterRegistry.registerLongHistogram(REINDEX_TIME_HISTOGRAM, "Time to reindex by search", "seconds")); | ||
| this.reindexTimeSecsHistogram = meterRegistry.registerLongHistogram(REINDEX_TIME_HISTOGRAM, "Time to reindex by search", "seconds"); | ||
| this.reindexCompletionHistogram = meterRegistry.registerLongHistogram( | ||
| REINDEX_COMPLETION_HISTOGRAM, | ||
| "Number of completed reindex operations", | ||
| "unit" | ||
| ); | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Shouldn't this be a
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I thought of using Counter, my understanding is that Counter accumulates over time, which is good if we only want to track the the total amount at any given time. Whereas using histogram has the advantage of aggregating over arbitrary period of time.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, a counter accumulates over time. In my experience, you then use it by calculating a rate over some period of time e.g. 15mins. That's considered best practice when using Prometheus, for example. I don't know whether the same thing works in Kibana. It relies on it being cheap and easy to compute rates. We should maybe ask around for advice — either within the team or with some subject expert.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I guess the other option is to ditch this metric and just have a single histogram metric for the durations with the error attribute as well as the source attribute? That would mean combining the listeners, I haven't checked whether that's doable.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I tried to find some existing Counter metrics in Kibana, and found the emitted values are not accumulating, which makes me wonder if my understanding on Counter is actually correct. I have put the details in this thread, hopefully someone can help answer there.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks @samxbr. |
||
| } | ||
|
|
||
| private ReindexMetrics(LongHistogram reindexTimeSecsHistogram) { | ||
| this.reindexTimeSecsHistogram = reindexTimeSecsHistogram; | ||
| } | ||
| public long recordTookTime(long tookTime, boolean remote) { | ||
| Map<String, Object> attributes = getAttributes(remote); | ||
|
|
||
| public long recordTookTime(long tookTime) { | ||
| reindexTimeSecsHistogram.record(tookTime); | ||
| reindexTimeSecsHistogram.record(tookTime, attributes); | ||
| return tookTime; | ||
| } | ||
|
|
||
| public void recordSuccess(boolean remote) { | ||
| Map<String, Object> attributes = getAttributes(remote); | ||
| // attribute ATTRIBUTE_ERROR_TYPE being absent indicates success | ||
| assert attributes.get(ATTRIBUTE_NAME_ERROR_TYPE) == null : "error.type attribute must not be present for successes"; | ||
|
|
||
| reindexCompletionHistogram.record(1L, attributes); | ||
| } | ||
|
|
||
| public void recordFailure(boolean remote, Throwable e) { | ||
| Map<String, Object> attributes = getAttributes(remote); | ||
| // best effort to extract useful error type if possible | ||
| String errorType; | ||
| if (e instanceof ElasticsearchStatusException ese) { | ||
| errorType = ese.status().name(); | ||
| } else { | ||
| errorType = e.getClass().getTypeName(); | ||
| } | ||
| attributes.put(ATTRIBUTE_NAME_ERROR_TYPE, errorType); | ||
|
|
||
| // attribute ATTRIBUTE_ERROR_TYPE being present indicates failure | ||
| // https://opentelemetry.io/docs/specs/semconv/general/recording-errors/#recording-errors-on-metrics | ||
| assert attributes.get(ATTRIBUTE_NAME_ERROR_TYPE) != null : "error.type attribute must be present for failures"; | ||
|
|
||
| reindexCompletionHistogram.record(1L, attributes); | ||
| } | ||
|
|
||
| private Map<String, Object> getAttributes(boolean remote) { | ||
| Map<String, Object> attributes = new HashMap<>(); | ||
| attributes.put(ATTRIBUTE_NAME_SOURCE, remote ? ATTRIBUTE_VALUE_SOURCE_REMOTE : ATTRIBUTE_VALUE_SOURCE_LOCAL); | ||
|
|
||
| return attributes; | ||
| } | ||
| } | ||
Uh oh!
There was an error while loading. Please reload this page.