Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 45 additions & 0 deletions clickhouse/assets/configuration/spec.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,51 @@ files:
value:
type: boolean
example: false
- name: query_errors
description: Configure collection of query errors from system.query_log
options:
- name: enabled
description: |
Enable collection of query errors. Requires `dbm: true`.
Collects ExceptionBeforeStart and ExceptionWhileProcessing events, which include
exception message, error code, and stack trace.
value:
type: boolean
example: true
- name: collection_interval
description: |
Set the query errors collection interval (in seconds).
value:
type: number
example: 10
- name: samples_per_hour_per_query
description: |
Set the maximum number of error samples to collect per hour per unique query signature.
Errors are high-signal events, so this defaults higher than query_completions.
value:
type: number
example: 60
- name: seen_samples_cache_maxsize
hidden: true
description: |
Set the max size of the cache used for rate limiting error samples.
value:
type: number
default: 10000
- name: max_samples_per_collection
hidden: true
description: |
Maximum number of error samples to collect in a single run (applies LIMIT in SQL).
value:
type: number
default: 1000
- name: run_sync
hidden: true
description: |
Run the query errors collection synchronously. For testing only.
value:
type: boolean
example: false
- template: instances/db
overrides:
custom_queries.value.example:
Expand Down
1 change: 1 addition & 0 deletions clickhouse/changelog.d/23041.added
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Add query error collection from system.query_log for DBM
15 changes: 15 additions & 0 deletions clickhouse/datadog_checks/clickhouse/clickhouse.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from .config import build_config, sanitize
from .health import ClickhouseHealth, HealthEvent, HealthStatus
from .query_completions import ClickhouseQueryCompletions
from .query_errors import ClickhouseQueryErrors
from .statement_samples import ClickhouseStatementSamples
from .statements import ClickhouseStatementMetrics
from .utils import ErrorSanitizer
Expand Down Expand Up @@ -120,6 +121,12 @@ def _init_dbm_components(self):
else:
self.query_completions = None

# Initialize query errors (from system.query_log - failed queries)
if self._config.dbm and self._config.query_errors.enabled:
self.query_errors = ClickhouseQueryErrors(self, self._config.query_errors)
else:
self.query_errors = None

@property
def tags(self) -> list[str]:
"""Return the current list of tags from the TagManager."""
Expand Down Expand Up @@ -244,6 +251,10 @@ def check(self, _):
if self.query_completions:
self.query_completions.run_job_loop(self.tags)

# Run query errors if DBM is enabled (from system.query_log - failed queries)
if self.query_errors:
self.query_errors.run_job_loop(self.tags)

@AgentCheck.metadata_entrypoint
def collect_version(self):
version = list(self.execute_query_raw('SELECT version()'))[0][0]
Expand Down Expand Up @@ -461,6 +472,8 @@ def cancel(self):
self.statement_samples.cancel()
if self.query_completions:
self.query_completions.cancel()
if self.query_errors:
self.query_errors.cancel()

# Wait for job loops to finish
if self.statement_metrics and self.statement_metrics._job_loop_future:
Expand All @@ -469,6 +482,8 @@ def cancel(self):
self.statement_samples._job_loop_future.result()
if self.query_completions and self.query_completions._job_loop_future:
self.query_completions._job_loop_future.result()
if self.query_errors and self.query_errors._job_loop_future:
self.query_errors._job_loop_future.result()

# Close main client
if self._client:
Expand Down
17 changes: 17 additions & 0 deletions clickhouse/datadog_checks/clickhouse/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,10 @@ def build_config(check: ClickhouseCheck) -> Tuple[InstanceConfig, ValidationResu
**dict_defaults.instance_query_completions().model_dump(),
**(instance.get('query_completions', {})),
},
"query_errors": {
**dict_defaults.instance_query_errors().model_dump(),
**(instance.get('query_errors', {})),
},
# Tags - ensure we have a list, not None
"tags": list(instance.get('tags', [])),
# Other settings
Expand Down Expand Up @@ -188,6 +192,13 @@ def _apply_validated_defaults(args: dict, instance: dict, validation_result: Val
f"query_completions.collection_interval must be greater than 0, defaulting to {default_value} seconds."
)

if _safefloat(args.get('query_errors', {}).get('collection_interval')) <= 0:
default_value = dict_defaults.instance_query_errors().collection_interval
args['query_errors']['collection_interval'] = default_value
validation_result.add_warning(
f"query_errors.collection_interval must be greater than 0, defaulting to {default_value} seconds."
)


def _validate_config(config: InstanceConfig, instance: dict, validation_result: ValidationResult):
"""Validate the configuration and add warnings/errors."""
Expand All @@ -203,6 +214,7 @@ def _validate_config(config: InstanceConfig, instance: dict, validation_result:
'query_completions',
config.query_completions.enabled if config.query_completions else False,
),
('query_errors', config.query_errors.enabled if config.query_errors else False),
]
for feature_name, _is_enabled in dbm_features:
if instance.get(feature_name, {}).get('enabled') and not config.dbm:
Expand Down Expand Up @@ -234,6 +246,11 @@ def _apply_features(config: InstanceConfig, validation_result: ValidationResult)
config.query_completions.enabled and config.dbm,
None if config.dbm else "Requires `dbm: true`",
)
validation_result.add_feature(
FeatureKey.QUERY_ERRORS,
config.query_errors.enabled and config.dbm,
None if config.dbm else "Requires `dbm: true`",
)
validation_result.add_feature(FeatureKey.SINGLE_ENDPOINT_MODE, config.single_endpoint_mode)


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,3 +42,14 @@ def instance_query_completions():
max_samples_per_collection=1000,
run_sync=False,
)


def instance_query_errors():
    """Return the default ``query_errors`` settings as a QueryErrors model.

    Used by build_config() to fill in any keys the user's instance config
    omits. Defaults mirror the documented spec: collection enabled every
    10s, with a higher per-query sample budget than query_completions
    because errors are high-signal events.
    """
    defaults = {
        'enabled': True,
        'collection_interval': 10,
        'samples_per_hour_per_query': 60,
        'seen_samples_cache_maxsize': 10000,
        'max_samples_per_collection': 1000,
        'run_sync': False,
    }
    return instance.QueryErrors(**defaults)
14 changes: 14 additions & 0 deletions clickhouse/datadog_checks/clickhouse/config_models/instance.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,19 @@ class QueryCompletions(BaseModel):
seen_samples_cache_maxsize: Optional[float] = None


class QueryErrors(BaseModel):
    """Schema for the ``query_errors`` instance option (DBM query-error collection).

    All fields are optional here; concrete defaults are supplied by
    ``instance_query_errors()`` and merged in ``build_config``.
    """

    model_config = ConfigDict(
        arbitrary_types_allowed=True,
        frozen=True,  # config is read-only after validation
    )
    # Seconds between error-collection runs.
    collection_interval: Optional[float] = None
    # Master switch for the feature; also requires `dbm: true` at the instance level.
    enabled: Optional[bool] = None
    # Hard cap on rows fetched per run (applied as a SQL LIMIT per the spec).
    max_samples_per_collection: Optional[float] = None
    # Run collection synchronously instead of in the job loop — testing only.
    run_sync: Optional[bool] = None
    # Rate limit: max error samples per hour per unique query signature.
    samples_per_hour_per_query: Optional[float] = None
    # Max size of the rate-limiting cache of seen samples.
    seen_samples_cache_maxsize: Optional[float] = None


class QueryMetrics(BaseModel):
model_config = ConfigDict(
arbitrary_types_allowed=True,
Expand Down Expand Up @@ -106,6 +119,7 @@ class InstanceConfig(BaseModel):
password: Optional[str] = None
port: Optional[int] = None
query_completions: Optional[QueryCompletions] = None
query_errors: Optional[QueryErrors] = None
query_metrics: Optional[QueryMetrics] = None
query_samples: Optional[QuerySamples] = None
read_timeout: Optional[int] = None
Expand Down
22 changes: 22 additions & 0 deletions clickhouse/datadog_checks/clickhouse/data/conf.yaml.example
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,28 @@ instances:
#
# samples_per_hour_per_query: 15

## Configure collection of query errors from system.query_log
#
# query_errors:

## @param enabled - boolean - optional - default: true
## Enable collection of query errors. Requires `dbm: true`.
## Collects ExceptionBeforeStart and ExceptionWhileProcessing events, which include
## exception message, error code, and stack trace.
#
# enabled: true

## @param collection_interval - number - optional - default: 10
## Set the query errors collection interval (in seconds).
#
# collection_interval: 10

## @param samples_per_hour_per_query - number - optional - default: 60
## Set the maximum number of error samples to collect per hour per unique query signature.
## Errors are high-signal events, so this defaults higher than query_completions.
#
# samples_per_hour_per_query: 60

## @param only_custom_queries - boolean - optional - default: false
## Set this parameter to `true` if you want to skip the integration's default metrics collection.
## Only metrics specified in `custom_queries` will be collected.
Expand Down
2 changes: 2 additions & 0 deletions clickhouse/datadog_checks/clickhouse/features.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ class FeatureKey(Enum):
QUERY_METRICS = "query_metrics"
QUERY_SAMPLES = "query_samples"
QUERY_COMPLETIONS = "query_completions"
QUERY_ERRORS = "query_errors"
SINGLE_ENDPOINT_MODE = "single_endpoint_mode"


Expand All @@ -29,6 +30,7 @@ class FeatureKey(Enum):
FeatureKey.QUERY_METRICS: 'Query Metrics',
FeatureKey.QUERY_SAMPLES: 'Query Samples',
FeatureKey.QUERY_COMPLETIONS: 'Query Completions',
FeatureKey.QUERY_ERRORS: 'Query Errors',
FeatureKey.SINGLE_ENDPOINT_MODE: 'Single Endpoint Mode',
}

Expand Down
6 changes: 3 additions & 3 deletions clickhouse/datadog_checks/clickhouse/query_completions.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,15 +115,15 @@ def _collect_and_submit(self):
# Step 3: Submit payload
payload_data = json.dumps(payload, default=default_json_event_encoding)
num_completions = len(payload.get('clickhouse_query_completions', []))
self._log.info(
self._log.debug(
"Submitting query completions payload: %d bytes, %d completions",
len(payload_data),
num_completions,
)
self._check.database_monitoring_query_activity(payload_data)

if self._current_checkpoint_microseconds is not None:
self._log.info(
self._log.debug(
"Successfully submitted. Checkpoint: %d microseconds", self._current_checkpoint_microseconds
)

Expand Down Expand Up @@ -162,7 +162,7 @@ def _collect_completed_queries(self):

rows = self._execute_query(query, parameters=params)

self._log.info(
self._log.debug(
"Loaded %d completed queries from %s [%s]",
len(rows),
query_log_table,
Expand Down
Loading
Loading