Skip to content
54 changes: 35 additions & 19 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,12 +46,21 @@ It collects key metrics about:
| rds_allocated_storage_bytes | `aws_account_id`, `aws_region`, `dbidentifier` | Allocated storage |
| rds_api_call_total | `api`, `aws_account_id`, `aws_region` | Number of calls to the AWS API |
| rds_backup_retention_period_seconds | `aws_account_id`, `aws_region`, `dbidentifier` | Automatic DB snapshots retention period |
| rds_burst_balance_percent | `aws_account_id`, `aws_region`, `dbidentifier` | Percent of General Purpose SSD (gp2) burst-bucket I/O credits available |
| rds_ca_certificate_valid_until | `aws_account_id`, `aws_region`, `dbidentifier` | Timestamp of the expiration of the Instance certificate |
| rds_checkpoint_lag_seconds | `aws_account_id`, `aws_region`, `dbidentifier` | The amount of time since the most recent checkpoint |
| rds_cpu_credit_balance_average | `aws_account_id`, `aws_region`, `dbidentifier` | Number of CPU credits available for the instance to burst beyond its base CPU utilization |
| rds_cpu_credit_usage_average | `aws_account_id`, `aws_region`, `dbidentifier` | Number of CPU credits consumed by the instance |
| rds_cpu_surplus_credit_balance_average | `aws_account_id`, `aws_region`, `dbidentifier` | Number of surplus CPU credits available for the instance to burst beyond its base CPU utilization |
| rds_cpu_surplus_credits_charged_average | `aws_account_id`, `aws_region`, `dbidentifier` | Number of surplus CPU credits charged when the instance exceeds its base CPU utilization |
| rds_cpu_usage_percent_average | `aws_account_id`, `aws_region`, `dbidentifier` | Instance CPU used |
| rds_database_connections_average | `aws_account_id`, `aws_region`, `dbidentifier` | The number of client network connections to the database instance |
| rds_dbload_average | `aws_account_id`, `aws_region`, `dbidentifier` | Number of active sessions for the DB engine |
| rds_dbload_cpu_average | `aws_account_id`, `aws_region`, `dbidentifier` | Number of active sessions where the wait event type is CPU |
| rds_dbload_noncpu_average | `aws_account_id`, `aws_region`, `dbidentifier` | Number of active sessions where the wait event type is not CPU |
| rds_disk_queue_depth_average | `aws_account_id`, `aws_region`, `dbidentifier` | Number of outstanding IOs (read/write requests) waiting to access the disk |
| rds_ebs_byte_balance_percent | `aws_account_id`, `aws_region`, `dbidentifier` | Percent of burst-bucket bytes available for EBS volumes |
| rds_ebs_iops_balance_percent | `aws_account_id`, `aws_region`, `dbidentifier` | Percent of burst-bucket IOPS available for EBS volumes |
| rds_exporter_build_info | `build_date`, `commit_sha`, `version` | A metric with constant '1' value labeled by version from which exporter was built |
| rds_exporter_errors_total | | Total number of errors encountered by the exporter |
| rds_free_storage_bytes | `aws_account_id`, `aws_region`, `dbidentifier` | Free storage on the instance |
Expand All @@ -71,19 +80,25 @@ It collects key metrics about:
| rds_max_disk_iops_average | `aws_account_id`, `aws_region`, `dbidentifier` | Max disk IOPS evaluated with disk IOPS and EC2 capacity |
| rds_max_storage_throughput_bytes | `aws_account_id`, `aws_region`, `dbidentifier` | Max disk throughput evaluated with disk throughput and EC2 capacity |
| rds_maximum_used_transaction_ids_average | `aws_account_id`, `aws_region`, `dbidentifier` | Maximum transaction IDs that have been used. Applies to only PostgreSQL |
| rds_network_receive_throughput_bytes | `aws_account_id`, `aws_region`, `dbidentifier` | The amount of network throughput received from the client by each instance in bytes per second |
| rds_network_transmit_throughput_bytes | `aws_account_id`, `aws_region`, `dbidentifier` | The amount of network throughput sent to the client by each instance in bytes per second |
| rds_oldest_replication_slot_lag_bytes | `aws_account_id`, `aws_region`, `dbidentifier` | The lag of the oldest replication slot in bytes |
| rds_quota_max_dbinstances_average | `aws_account_id`, `aws_region` | Maximum number of RDS instances allowed in the AWS account |
| rds_quota_maximum_db_instance_snapshots_average | `aws_account_id`, `aws_region` | Maximum number of manual DB instance snapshots |
| rds_quota_total_storage_bytes | `aws_account_id`, `aws_region` | Maximum total storage for all DB instances |
| rds_read_iops_average | `aws_account_id`, `aws_region`, `dbidentifier` | Average number of disk read I/O operations per second |
| rds_read_latency_seconds | `aws_account_id`, `aws_region`, `dbidentifier` | The average amount of time taken per disk read I/O operation |
| rds_read_throughput_bytes | `aws_account_id`, `aws_region`, `dbidentifier` | Average number of bytes read from disk per second |
| rds_replica_lag_seconds | `aws_account_id`, `aws_region`, `dbidentifier` | For read replica configurations, the amount of time a read replica DB instance lags behind the source DB instance. Applies to MariaDB, Microsoft SQL Server, MySQL, Oracle, and PostgreSQL read replicas |
| rds_replication_slot_disk_usage_bytes | `aws_account_id`, `aws_region`, `dbidentifier` | Disk space used by replication slot files. Applies to PostgreSQL |
| rds_swap_usage_bytes | `aws_account_id`, `aws_region`, `dbidentifier` | Amount of swap space used on the DB instance. This metric is not available for SQL Server |
| rds_transaction_logs_disk_usage_bytes | `aws_account_id`, `aws_region`, `dbidentifier` | Disk space used by transaction logs (only on PostgreSQL) |
| rds_transaction_logs_generation_bytes | `aws_account_id`, `aws_region`, `dbidentifier` | The amount of transaction logs generated per second |
| rds_usage_allocated_storage_bytes | `aws_account_id`, `aws_region` | Total storage used by AWS RDS instances |
| rds_usage_db_instances_average | `aws_account_id`, `aws_region` | AWS RDS instance count |
| rds_usage_manual_snapshots_average | `aws_account_id`, `aws_region` | Manual snapshots count |
| rds_write_iops_average | `aws_account_id`, `aws_region`, `dbidentifier` | Average number of disk write I/O operations per second |
| rds_write_latency_seconds | `aws_account_id`, `aws_region`, `dbidentifier` | The average amount of time taken per disk write I/O operation |
| rds_write_throughput_bytes | `aws_account_id`, `aws_region`, `dbidentifier` | Average number of bytes written to disk per second |
| up | | Was the last scrape of RDS successful |

Expand Down Expand Up @@ -200,25 +215,26 @@ Prometheus RDS exporter</br>

Configuration can be defined in [prometheus-rds-exporter.yaml](https://github.com/qonto/prometheus-rds-exporter/blob/main/configs/prometheus-rds-exporter/prometheus-rds-exporter.yaml) or via environment variables (format `PROMETHEUS_RDS_EXPORTER_<PARAMETER_NAME>`).

|Parameter | Description | Default |
| ------------------------ | -------------------------------------------------------------------------------------------------------------------------- | ----------------------- |
| aws-assume-role-arn | AWS IAM ARN role to assume to fetch metrics | |
| aws-assume-role-session | AWS assume role session name | prometheus-rds-exporter |
| collect-instance-metrics | Collect AWS instances metrics (AWS Cloudwatch API) | true |
| collect-instance-tags | Collect AWS RDS tags | true |
| collect-instance-types | Collect AWS instance types information (AWS EC2 API) | true |
| collect-logs-size | Collect AWS instances logs size (AWS RDS API) | true |
| collect-maintenances | Collect AWS instances maintenances (AWS RDS API) | true |
| collect-quotas | Collect AWS RDS quotas (AWS quotas API) | true |
| collect-usages | Collect AWS RDS usages (AWS Cloudwatch API) | true |
| tag-selections | Tags to select database instances with. Refer to [dedicated section on tag configuration](#tag-configuration) | |
| debug | Enable debug mode | |
| enable-otel-traces | Enable OpenTelemetry traces. See [configuration](https://opentelemetry.io/docs/languages/sdk-configuration/otlp-exporter/) | false |
| listen-address | Address to listen on for web interface | :9043 |
| log-format | Log format (`text` or `json`) | json |
| metrics-path | Path under which to expose metrics | /metrics |
| tls-cert-path | Path to TLS certificate | |
| tls-key-path | Path to private key for TLS | |
|Parameter | Description | Default |
| ------------------------ |-----------------------------------------------------------------------------------------------------------------------------| ---------------------- |
| aws-assume-role-arn | AWS IAM ARN role to assume to fetch metrics | |
| aws-assume-role-session | AWS assume role session name | prometheus-rds-exporter |
| collect-instance-metrics | Collect AWS instances metrics (AWS Cloudwatch API) | true |
| collect-instance-metrics-delay | Delay in collecting instance metrics (in seconds) to avoid CloudWatch API throttling. | 0 |
| collect-instance-tags | Collect AWS RDS tags | true |
| collect-instance-types | Collect AWS instance types information (AWS EC2 API) | true |
| collect-logs-size | Collect AWS instances logs size (AWS RDS API) | true |
| collect-maintenances | Collect AWS instances maintenances (AWS RDS API) | true |
| collect-quotas | Collect AWS RDS quotas (AWS quotas API) | true |
| collect-usages | Collect AWS RDS usages (AWS Cloudwatch API) | true |
| tag-selections | Tags to select database instances with. Refer to [dedicated section on tag configuration](#tag-configuration) | |
| debug | Enable debug mode | |
| enable-otel-traces | Enable OpenTelemetry traces. See [configuration](https://opentelemetry.io/docs/languages/sdk-configuration/otlp-exporter/) | false |
| listen-address | Address to listen on for web interface | :9043 |
| log-format | Log format (`text` or `json`) | json |
| metrics-path | Path under which to expose metrics | /metrics |
| tls-cert-path | Path to TLS certificate | |
| tls-key-path | Path to private key for TLS | |

Configuration parameters priorities:

Expand Down
53 changes: 28 additions & 25 deletions cmd/root.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,23 +39,24 @@ var (
)

// exporterConfig groups the exporter's settings. Each field carries a koanf
// struct tag naming the configuration key it is bound to; the keys match the
// parameter names documented in the README and the command-line flags.
type exporterConfig struct {
Debug bool `koanf:"debug"`
LogFormat string `koanf:"log-format"`
TLSCertPath string `koanf:"tls-cert-path"`
TLSKeyPath string `koanf:"tls-key-path"`
MetricPath string `koanf:"metrics-path"`
ListenAddress string `koanf:"listen-address"`
AWSAssumeRoleSession string `koanf:"aws-assume-role-session"`
AWSAssumeRoleArn string `koanf:"aws-assume-role-arn"`
CollectInstanceMetrics bool `koanf:"collect-instance-metrics"`
CollectInstanceTags bool `koanf:"collect-instance-tags"`
CollectInstanceTypes bool `koanf:"collect-instance-types"`
CollectLogsSize bool `koanf:"collect-logs-size"`
CollectMaintenances bool `koanf:"collect-maintenances"`
CollectQuotas bool `koanf:"collect-quotas"`
CollectUsages bool `koanf:"collect-usages"`
OTELTracesEnabled bool `koanf:"enable-otel-traces"`
TagSelections map[string][]string `koanf:"tag-selections"`
Debug bool `koanf:"debug"`
LogFormat string `koanf:"log-format"`
TLSCertPath string `koanf:"tls-cert-path"`
TLSKeyPath string `koanf:"tls-key-path"`
MetricPath string `koanf:"metrics-path"`
ListenAddress string `koanf:"listen-address"`
AWSAssumeRoleSession string `koanf:"aws-assume-role-session"`
AWSAssumeRoleArn string `koanf:"aws-assume-role-arn"`
CollectInstanceMetrics bool `koanf:"collect-instance-metrics"`
// Delay, in seconds, before collecting instance metrics (flag default: 0).
CollectInstanceMetricsDelay int `koanf:"collect-instance-metrics-delay"`
CollectInstanceTags bool `koanf:"collect-instance-tags"`
CollectInstanceTypes bool `koanf:"collect-instance-types"`
CollectLogsSize bool `koanf:"collect-logs-size"`
CollectMaintenances bool `koanf:"collect-maintenances"`
CollectQuotas bool `koanf:"collect-quotas"`
CollectUsages bool `koanf:"collect-usages"`
OTELTracesEnabled bool `koanf:"enable-otel-traces"`
TagSelections map[string][]string `koanf:"tag-selections"`
}

func run(configuration exporterConfig) {
Expand Down Expand Up @@ -92,14 +93,15 @@ func run(configuration exporterConfig) {
servicequotasClient := servicequotas.NewFromConfig(cfg)

collectorConfiguration := exporter.Configuration{
CollectInstanceMetrics: configuration.CollectInstanceMetrics,
CollectInstanceTypes: configuration.CollectInstanceTypes,
CollectInstanceTags: configuration.CollectInstanceTags,
CollectLogsSize: configuration.CollectLogsSize,
CollectMaintenances: configuration.CollectMaintenances,
CollectQuotas: configuration.CollectQuotas,
CollectUsages: configuration.CollectUsages,
TagSelections: configuration.TagSelections,
CollectInstanceMetrics: configuration.CollectInstanceMetrics,
CollectInstanceMetricsDelay: configuration.CollectInstanceMetricsDelay,
CollectInstanceTypes: configuration.CollectInstanceTypes,
CollectInstanceTags: configuration.CollectInstanceTags,
CollectLogsSize: configuration.CollectLogsSize,
CollectMaintenances: configuration.CollectMaintenances,
CollectQuotas: configuration.CollectQuotas,
CollectUsages: configuration.CollectUsages,
TagSelections: configuration.TagSelections,
}

collector := exporter.NewCollector(*logger, collectorConfiguration, awsAccountID, awsRegion, rdsClient, ec2Client, cloudWatchClient, servicequotasClient, tagClient)
Expand Down Expand Up @@ -163,6 +165,7 @@ func NewRootCommand() (*cobra.Command, error) {
cmd.Flags().BoolP("collect-instance-tags", "", true, "Collect AWS RDS tags")
cmd.Flags().BoolP("collect-instance-types", "", true, "Collect AWS instance types")
cmd.Flags().BoolP("collect-instance-metrics", "", true, "Collect AWS instance metrics")
cmd.Flags().IntP("collect-instance-metrics-delay", "", 0, "Delay in seconds before collecting instance metrics")
cmd.Flags().BoolP("collect-logs-size", "", true, "Collect AWS instances logs size")
cmd.Flags().BoolP("collect-maintenances", "", true, "Collect AWS instances maintenances")
cmd.Flags().BoolP("collect-quotas", "", true, "Collect AWS RDS quotas")
Expand Down
3 changes: 3 additions & 0 deletions configs/prometheus-rds-exporter/prometheus-rds-exporter.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,9 @@
# Collect AWS instances metrics (AWS Cloudwatch API)
# collect-instance-metrics: true

# Delay in seconds before collecting instance metrics (default: 0)
# collect-instance-metrics-delay: 60

# Collect AWS instance tags (AWS RDS API)
# collect-instance-tags: true

Expand Down
Loading