Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[wip] prometheus monitoring support #26837

Closed
wants to merge 23 commits into from
Closed
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions corehq/util/datadog/apps.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
from django.apps import AppConfig
from django.conf import settings


class DatadogConfig(AppConfig):

name = 'corehq.util.datadog'
verbose_name = 'Datadog'

def ready(self):
if not settings.DATADOG_API_KEY or not settings.DATADOG_APP_KEY:
return

try:
from datadog import initialize
except ImportError:
pass
else:
initialize(settings.DATADOG_API_KEY, settings.DATADOG_APP_KEY)

if settings.UNIT_TESTING or settings.DEBUG or 'ddtrace.contrib.django' not in settings.INSTALLED_APPS:
try:
from ddtrace import tracer
tracer.enabled = False
except ImportError:
pass

19 changes: 19 additions & 0 deletions corehq/util/metrics/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
from corehq.util.metrics.datadog import DatadogMetrics
from corehq.util.metrics.metrics import DummyMetrics, DelegatedMetrics, HqMetrics
from corehq.util.metrics.prometheus import PrometheusMetrics

_metrics = None # singleton/global


def get_metrics() -> HqMetrics:
global _metrics
if not _metrics:
enabled = list(filter(lambda m: m.enabled(), [DatadogMetrics(), PrometheusMetrics()]))
if not enabled:
_metrics = DummyMetrics()

if len(enabled) > 1:
_metrics = DelegatedMetrics(enabled)

_metrics = enabled[0]
return _metrics
49 changes: 49 additions & 0 deletions corehq/util/metrics/datadog.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
import logging

from django.conf import settings

from corehq.util.metrics.metrics import HqCounter, HqGauge, HqMetrics
from datadog import api
from datadog.dogstatsd.base import DogStatsd

datadog_logger = logging.getLogger('datadog')

COMMON_TAGS = ['environment:{}'.format(settings.SERVER_ENVIRONMENT)]

statsd = DogStatsd(constant_tags=COMMON_TAGS)


def _format_tags(tag_names, tag_values):
if not tag_names or not tag_values:
return None

return [
f'{name}:{value}' for name, value in zip(tag_names, tag_values)
]


def _datadog_record(fn, name, value, tags=None):
try:
fn(name, value, tags=tags)
except Exception:
datadog_logger.exception('Unable to record Datadog stats')


class Counter(HqCounter):
def _inc(self, amount=1):
tags = _format_tags(self.tag_names, self.tag_values)
_datadog_record(statsd.increment, self.name, amount, tags)


class Gauge(HqGauge):
def _set(self, value):
tags = _format_tags(self.tag_names, self.tag_values)
_datadog_record(statsd.gauge, self.name, value, tags)


class DatadogMetrics(HqMetrics):
_counter_class = Counter
_gauge_class = Gauge

def enabled(self) -> bool:
return api._api_key and api._application_key
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this doesn't look like it will always return bool (it will probably return a string or None). If you want a bool you could do

return bool(api._api_key and api._application_key)

But that doesn't seem very Pythonic. I'd probably drop the type hint.

However, I'm not very familiar with the conventions around type hints. Does -> bool imply that returned object can be treated like a bool (most objects support that), or that the returned value is always True or False? This is one reason I'm not a fan of type hints in Python.

139 changes: 139 additions & 0 deletions corehq/util/metrics/metrics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
import abc
import re
from abc import abstractmethod
from typing import Iterable

from corehq.util.soft_assert import soft_assert

METRIC_NAME_RE = re.compile(r'^[a-zA-Z_:.][a-zA-Z0-9_:.]*$')
METRIC_TAG_NAME_RE = re.compile(r'^[a-zA-Z_][a-zA-Z0-9_]*$')
RESERVED_METRIC_TAG_NAME_RE = re.compile(r'^__.*$')
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can this be simplified?

Suggested change
RESERVED_METRIC_TAG_NAME_RE = re.compile(r'^__.*$')
RESERVED_METRIC_TAG_NAME_RE = re.compile(r'^__')

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

True, this was copied from the Prometheus client.

RESERVED_METRIC_TAG_NAMES = ['quantile', 'le']


def _enforce_prefix(name, prefix):
soft_assert(fail_if_debug=True).call(
not prefix or name.startswith(prefix),
"Did you mean to call your metric 'commcare.{}'? ".format(name))


def _validate_tag_names(tag_names):
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is required by Prometheus.

tag_names = tuple(tag_names)
for l in tag_names:
if not METRIC_TAG_NAME_RE.match(l):
raise ValueError('Invalid metric tag name: ' + l)
if RESERVED_METRIC_TAG_NAME_RE.match(l):
raise ValueError('Reserved metric tag name: ' + l)
if l in RESERVED_METRIC_TAG_NAMES:
raise ValueError('Reserved metric tag name: ' + l)
return tag_names


class MetricBase:
def __init__(self, name: str, documentation: str, tag_names: Iterable=tuple(), tag_values=None):
self.name = name
if not METRIC_NAME_RE.match(name):
raise ValueError('Invalid metric name: ' + name)
_enforce_prefix(name, 'commcare')
self.documentation = documentation
self.tag_names = _validate_tag_names(tag_names)
self.tag_values = tag_values

def tag(self, **tag_kwargs):
if sorted(tag_kwargs) != sorted(self.tag_names):
raise ValueError('Incorrect tag names')

tag_values = tuple(str(tag_kwargs[t]) for t in self.tag_names)
return self._get_tagged_instance(tag_values)

def _get_tagged_instance(self, tag_values):
return self.__class__(self.name, self.documentation, self.tag_names, tag_values)

def _validate_tags(self):
if self.tag_names and not self.tag_values:
raise Exception('Metric has missing tag values.')


class HqCounter(MetricBase):
def inc(self, amount: float = 1):
"""Increment the counter by the given amount."""
self._validate_tags()
self._inc(amount)

def _inc(self, amount: float = 1):
"""Override this method to record the metric"""
pass


class HqGauge(MetricBase):
def set(self, value: float):
"""Set gauge to the given value."""
self._validate_tags()
self._set(value)

def _set(self, value: float):
"""Override this method to record the metric"""
pass


class HqMetrics(metaclass=abc.ABCMeta):
_counter_class = None
_gauge_class = None

@abstractmethod
def enabled(self) -> bool:
raise NotImplementedError

def counter(self, name: str, documentation: str, tag_names: Iterable=tuple()) -> HqCounter:
return self._counter_class(name, documentation, tag_names)

def gauge(self, name: str, documentation: str, tag_names: Iterable=tuple()) -> HqGauge:
return self._gauge_class(name, documentation, tag_names)


class DummyMetrics(HqMetrics):
_counter_class = HqCounter
_gauge_class = HqGauge

def enabled(self) -> bool:
return True


class DelegatedMetrics(HqMetrics):
def __init__(self, delegates):
self.delegates = delegates

def enabled(self) -> bool:
return True

def counter(self, name: str, documentation: str, tag_names: Iterable=tuple()):
return DelegatingCounter([
d.counter(name, documentation, tag_names) for d in self.delegates
])

def gauge(self, name: str, documentation: str, tag_names: Iterable=tuple()):
return DelegatingGauge([
d.gauge(name, documentation, tag_names) for d in self.delegates
])


class DelegatingMetric:
def __init__(self, delegates):
self._delegates = delegates

def tag(self, **tag_kwargs):
return self.__class__([
d.tag(**tag_kwargs) for d in self._delegates
])


class DelegatingCounter(DelegatingMetric):
def inc(self, amount: float = 1):
for d in self._delegates:
d.inc(amount)


class DelegatingGauge(DelegatingMetric):
def set(self, value: float):
for d in self._delegates:
d.set(value)
46 changes: 46 additions & 0 deletions corehq/util/metrics/prometheus.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
from typing import Iterable

import settings
from corehq.util.metrics.metrics import (

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

F401 'corehq.util.metrics.metrics.MetricBase' imported but unused

HqCounter,
HqGauge,
HqMetrics,
MetricBase,
)
from prometheus_client import Counter as PCounter
from prometheus_client import Gauge as PGauge


class PrometheusMetricBase(MetricBase):
_metric_class = None

def __init__(self, name: str, documentation: str, tag_names: Iterable=tuple(), tag_values=None, delegate=None):
name = name.replace('.', '_')
super().__init__(name, documentation, tag_names, tag_values)
self._delegate = delegate or self._metric_class(name, documentation, tag_names, labelvalues=tag_values)

def _get_tagged_instance(self, tag_values):
delegate = self._delegate.labels(dict(zip(self.tag_names, tag_values)))
return self.__class__(self.name, self.documentation, self.tag_names, self.tag_values, delegate)


class Counter(PrometheusMetricBase, HqCounter):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oof, multiple inheritance with diamonds... I don't love this.

I'd rather have simple functional call patterns like our existing datadog_counter(..., tags=...) (made generic so as not to have "datadog" in the name) rather than changing all of our metrics calls to use a new metrics system with extensive use of inheritance.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, the inheritance isn't great here. The alternative is a bit more duplicate code but not a lot. I'll see what I can do to make it better if we stay on this course.

If we want to keep it functional then we would need to have a centralized metrics singleton that would always return the same metric class when called (in the case of Pometheus). It would also mean that every call to the metric would need to pass in all required args (which is the case now anyway):

metrics_histogram(value, tags=..., buckets=..., unit=..., dd_bucket_tag=...)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've resolved this by removing the need to create new copies of the class when applying tags:

- metric.tag(group='a').inc()
+ metric.inc(group='a')

_metric_class = PCounter

def _inc(self, amount=1):
self._delegate.inc(amount)


class Gauge(PrometheusMetricBase, HqGauge):
_metric_class = PGauge

def _set(self, value: float):
self._delegate.set(value)


class PrometheusMetrics(HqMetrics):
_counter_class = Counter
_gauge_class = Gauge

def enabled(self) -> bool:
return settings.ENABLE_PROMETHEUS_METRICS
15 changes: 2 additions & 13 deletions settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -340,6 +340,7 @@
'corehq.motech.openmrs',
'corehq.motech.repeaters',
'corehq.util',
'corehq.util.datadog.apps.DatadogConfig',
'dimagi.ext',
'corehq.blobs',
'corehq.apps.case_search',
Expand Down Expand Up @@ -837,6 +838,7 @@

DATADOG_API_KEY = None
DATADOG_APP_KEY = None
ENABLE_PROMETHEUS_METRICS = False

SYNCLOGS_SQL_DB_ALIAS = 'default'

Expand Down Expand Up @@ -2073,19 +2075,6 @@
if 'dummy' not in CACHES:
CACHES['dummy'] = {'BACKEND': 'django.core.cache.backends.dummy.DummyCache'}

try:
from datadog import initialize
except ImportError:
pass
else:
initialize(DATADOG_API_KEY, DATADOG_APP_KEY)

if UNIT_TESTING or DEBUG or 'ddtrace.contrib.django' not in INSTALLED_APPS:
try:
from ddtrace import tracer
tracer.enabled = False
except ImportError:
pass

REST_FRAMEWORK = {
'DATETIME_FORMAT': '%Y-%m-%dT%H:%M:%S.%fZ',
Expand Down