Prometheus-载入配置

基于v3.5

启动顺序

sequenceDiagram
    participant main.go
    participant Config
    participant GlobalConfig
    participant ScrapeConfig
    participant RemoteWriteConfig
    participant RuntimeConfig
    participant Storage
    
    main.go->>Config: 加载配置文件(Load/LoadFile)
    Config->>GlobalConfig: 初始化全局配置
    Config->>ScrapeConfig: 初始化抓取配置
    Config->>RemoteWriteConfig: 初始化远程写入配置
    Config->>RuntimeConfig: 初始化运行时配置
    
    main.go->>Storage: 初始化存储(TSDB/WAL)
    
    loop 配置验证
        Config->>GlobalConfig: 验证外部标签等
        Config->>ScrapeConfig: 验证抓取配置
        Config->>RemoteWriteConfig: 验证远程写入配置
    end
    
    Config->>Config: 标记配置已加载(loaded=true,在 Load 内部完成)
    
    Note right of Config: 配置加载完成
    
    main.go->>ScrapeConfig: 启动抓取管理器
    main.go->>RemoteWriteConfig: 启动远程写入
    main.go->>RuntimeConfig: 应用运行时参数(GOMAXPROCS等)
    
    Note over main.go,Storage: 启动核心组件

详细流程

配置项的定义

以下为 Config、GlobalConfig、RuntimeConfig 与 ScrapeConfig 的结构体定义:
// Config is the top-level configuration for Prometheus's config files.
// Each exported field maps to one top-level YAML section via its tag.
type Config struct {
GlobalConfig GlobalConfig `yaml:"global"` // Global settings.
Runtime RuntimeConfig `yaml:"runtime,omitempty"` // Runtime settings; optional.
AlertingConfig AlertingConfig `yaml:"alerting,omitempty"` // Alerting settings; optional.
RuleFiles []string `yaml:"rule_files,omitempty"` // List of rule file paths; optional.
ScrapeConfigFiles []string `yaml:"scrape_config_files,omitempty"` // List of scrape config file paths; optional.
ScrapeConfigs []*ScrapeConfig `yaml:"scrape_configs,omitempty"` // List of scrape configurations; optional.
StorageConfig StorageConfig `yaml:"storage,omitempty"` // Storage settings; optional.
TracingConfig TracingConfig `yaml:"tracing,omitempty"` // Tracing settings; optional.

RemoteWriteConfigs []*RemoteWriteConfig `yaml:"remote_write,omitempty"` // List of remote write configurations; optional.
RemoteReadConfigs []*RemoteReadConfig `yaml:"remote_read,omitempty"` // List of remote read configurations; optional.
OTLPConfig OTLPConfig `yaml:"otlp,omitempty"` // OTLP settings; optional.

loaded bool // Certain methods require configuration to use Load validation.
// loaded records whether the configuration went through Load; Load sets it
// to true on success and GetScrapeConfigs returns an error while it is false.
}

// GlobalConfig configures values that are used across other configuration
// objects.
type GlobalConfig struct {
// How frequently to scrape targets by default.
ScrapeInterval model.Duration `yaml:"scrape_interval,omitempty"`
// The default timeout when scraping targets.
ScrapeTimeout model.Duration `yaml:"scrape_timeout,omitempty"`
// The protocols to negotiate during a scrape. It tells clients what
// protocol are accepted by Prometheus and with what weight (most wanted is first).
// Supported values (case sensitive): PrometheusProto, OpenMetricsText0.0.1,
// OpenMetricsText1.0.0, PrometheusText0.0.4.
ScrapeProtocols []ScrapeProtocol `yaml:"scrape_protocols,omitempty"`
// How frequently to evaluate rules by default.
EvaluationInterval model.Duration `yaml:"evaluation_interval,omitempty"`
// Offset the rule evaluation timestamp of this particular group by the specified duration into the past to ensure the underlying metrics have been received.
RuleQueryOffset model.Duration `yaml:"rule_query_offset,omitempty"`
// File to which PromQL queries are logged.
QueryLogFile string `yaml:"query_log_file,omitempty"`
// File to which scrape failures are logged.
ScrapeFailureLogFile string `yaml:"scrape_failure_log_file,omitempty"`
// The labels to add to any timeseries that this Prometheus instance scrapes.
// These are the external labels; Load expands environment variable
// references in their values.
ExternalLabels labels.Labels `yaml:"external_labels,omitempty"`
// An uncompressed response body larger than this many bytes will cause the
// scrape to fail. 0 means no limit.
BodySizeLimit units.Base2Bytes `yaml:"body_size_limit,omitempty"`
// More than this many samples post metric-relabeling will cause the scrape to
// fail. 0 means no limit.
SampleLimit uint `yaml:"sample_limit,omitempty"`
// More than this many targets after the target relabeling will cause the
// scrapes to fail. 0 means no limit.
TargetLimit uint `yaml:"target_limit,omitempty"`
// More than this many labels post metric-relabeling will cause the scrape to
// fail. 0 means no limit.
LabelLimit uint `yaml:"label_limit,omitempty"`
// More than this label name length post metric-relabeling will cause the
// scrape to fail. 0 means no limit.
LabelNameLengthLimit uint `yaml:"label_name_length_limit,omitempty"`
// More than this label value length post metric-relabeling will cause the
// scrape to fail. 0 means no limit.
LabelValueLengthLimit uint `yaml:"label_value_length_limit,omitempty"`
// Keep no more than this many dropped targets per job.
// 0 means no limit.
KeepDroppedTargets uint `yaml:"keep_dropped_targets,omitempty"`
// Allow UTF8 Metric and Label Names. Can be blank in config files but must
// have a value if a ScrapeConfig is created programmatically.
MetricNameValidationScheme string `yaml:"metric_name_validation_scheme,omitempty"`
// Metric name escaping mode to request through content negotiation. Can be
// blank in config files but must have a value if a ScrapeConfig is created
// programmatically.
MetricNameEscapingScheme string `yaml:"metric_name_escaping_scheme,omitempty"`
// Whether to convert all scraped classic histograms into native histograms with custom buckets.
ConvertClassicHistogramsToNHCB bool `yaml:"convert_classic_histograms_to_nhcb,omitempty"`
// Whether to scrape a classic histogram, even if it is also exposed as a native histogram.
AlwaysScrapeClassicHistograms bool `yaml:"always_scrape_classic_histograms,omitempty"`
}

// RuntimeConfig configures the values for the process behavior.
type RuntimeConfig struct {
// The Go garbage collection target percentage.
// Read from the "gogc" key of the runtime section.
GoGC int `yaml:"gogc,omitempty"`

// Below are guidelines for adding a new field:
//
// For config that shouldn't change after startup, you might want to use
// flags https://prometheus.io/docs/prometheus/latest/command-line/prometheus/.
//
// Consider when the new field is first applied: at the very beginning of instance
// startup, after the TSDB is loaded etc. See https://github.com/prometheus/prometheus/pull/16491
// for an example.
//
// Provide a test covering various scenarios: empty config file, empty or incomplete runtime
// config block, precedence over other inputs (e.g., env vars, if applicable) etc.
// See TestRuntimeGOGCConfig (or https://github.com/prometheus/prometheus/pull/15238).
//
// The test should also verify behavior on reloads, since this config should be
// adjustable at runtime.
}

// ScrapeConfig configures a scraping unit for Prometheus.
type ScrapeConfig struct {
// The job name to which the job label is set by default.
JobName string `yaml:"job_name"`
// Indicator whether the scraped metrics should remain unmodified.
HonorLabels bool `yaml:"honor_labels,omitempty"`
// Indicator whether the scraped timestamps should be respected.
HonorTimestamps bool `yaml:"honor_timestamps"`
// Indicator whether to track the staleness of the scraped timestamps.
TrackTimestampsStaleness bool `yaml:"track_timestamps_staleness"`
// A set of query parameters with which the target is scraped.
Params url.Values `yaml:"params,omitempty"`
// How frequently to scrape the targets of this scrape config.
ScrapeInterval model.Duration `yaml:"scrape_interval,omitempty"`
// The timeout for scraping targets of this config.
ScrapeTimeout model.Duration `yaml:"scrape_timeout,omitempty"`
// The protocols to negotiate during a scrape. It tells clients what
// protocol are accepted by Prometheus and with what preference (most wanted is first).
// Supported values (case sensitive): PrometheusProto, OpenMetricsText0.0.1,
// OpenMetricsText1.0.0, PrometheusText1.0.0, PrometheusText0.0.4.
ScrapeProtocols []ScrapeProtocol `yaml:"scrape_protocols,omitempty"`
// The fallback protocol to use if the Content-Type provided by the target
// is not provided, blank, or not one of the expected values.
ScrapeFallbackProtocol ScrapeProtocol `yaml:"fallback_scrape_protocol,omitempty"`
// Whether to scrape a classic histogram, even if it is also exposed as a native histogram.
AlwaysScrapeClassicHistograms *bool `yaml:"always_scrape_classic_histograms,omitempty"`
// Whether to convert all scraped classic histograms into a native histogram with custom buckets.
ConvertClassicHistogramsToNHCB *bool `yaml:"convert_classic_histograms_to_nhcb,omitempty"`
// File to which scrape failures are logged.
ScrapeFailureLogFile string `yaml:"scrape_failure_log_file,omitempty"`
// The HTTP resource path on which to fetch metrics from targets.
MetricsPath string `yaml:"metrics_path,omitempty"`
// The URL scheme with which to fetch metrics from targets.
Scheme string `yaml:"scheme,omitempty"`
// Indicator whether to request compressed response from the target.
EnableCompression bool `yaml:"enable_compression"`
// An uncompressed response body larger than this many bytes will cause the
// scrape to fail. 0 means no limit.
BodySizeLimit units.Base2Bytes `yaml:"body_size_limit,omitempty"`
// More than this many samples post metric-relabeling will cause the scrape to
// fail. 0 means no limit.
SampleLimit uint `yaml:"sample_limit,omitempty"`
// More than this many targets after the target relabeling will cause the
// scrapes to fail. 0 means no limit.
TargetLimit uint `yaml:"target_limit,omitempty"`
// More than this many labels post metric-relabeling will cause the scrape to
// fail. 0 means no limit.
LabelLimit uint `yaml:"label_limit,omitempty"`
// More than this label name length post metric-relabeling will cause the
// scrape to fail. 0 means no limit.
LabelNameLengthLimit uint `yaml:"label_name_length_limit,omitempty"`
// More than this label value length post metric-relabeling will cause the
// scrape to fail. 0 means no limit.
LabelValueLengthLimit uint `yaml:"label_value_length_limit,omitempty"`
// If there are more than this many buckets in a native histogram,
// buckets will be merged to stay within the limit.
NativeHistogramBucketLimit uint `yaml:"native_histogram_bucket_limit,omitempty"`
// If the growth factor of one bucket to the next is smaller than this,
// buckets will be merged to increase the factor sufficiently.
NativeHistogramMinBucketFactor float64 `yaml:"native_histogram_min_bucket_factor,omitempty"`
// Keep no more than this many dropped targets per job.
// 0 means no limit.
KeepDroppedTargets uint `yaml:"keep_dropped_targets,omitempty"`
// Allow UTF8 Metric and Label Names. Can be blank in config files but must
// have a value if a ScrapeConfig is created programmatically.
MetricNameValidationScheme string `yaml:"metric_name_validation_scheme,omitempty"`
// Metric name escaping mode to request through content negotiation. Can be
// blank in config files but must have a value if a ScrapeConfig is created
// programmatically.
MetricNameEscapingScheme string `yaml:"metric_name_escaping_scheme,omitempty"`

// We cannot do proper Go type embedding below as the parser will then parse
// values arbitrarily into the overflow maps of further-down types.

// Service discovery configurations. The "-" tag excludes this field from
// direct YAML field mapping.
ServiceDiscoveryConfigs discovery.Configs `yaml:"-"`
// HTTP client configuration; its keys are inlined into the scrape config.
HTTPClientConfig config.HTTPClientConfig `yaml:",inline"`

// List of target relabel configurations.
RelabelConfigs []*relabel.Config `yaml:"relabel_configs,omitempty"`
// List of metric relabel configurations.
MetricRelabelConfigs []*relabel.Config `yaml:"metric_relabel_configs,omitempty"`
}

加载配置

以下依次为 main.go 中的调用片段,以及 LoadFile 与 Load 的实现:
// Validate the configuration file up front so that an invalid file aborts
// startup before any other component is started.
// Throw error for invalid config before starting other components.
var cfgFile *config.Config
// A no-op logger is passed to LoadFile for this load.
if cfgFile, err = config.LoadFile(cfg.configFile, agentMode, promslog.NewNopLogger()); err != nil {
// Report the absolute path when it can be resolved; otherwise fall back
// to the path exactly as given on the command line.
absPath, pathErr := filepath.Abs(cfg.configFile)
if pathErr != nil {
absPath = cfg.configFile
}
logger.Error(fmt.Sprintf("Error loading config (--config.file=%s)", cfg.configFile), "file", absPath, "err", err)
// Terminate the process with exit status 2 on an unloadable config.
os.Exit(2)
}

// LoadFile reads the file at filename, parses it with Load and returns the
// resulting Config. Read errors are returned as-is; parse errors are wrapped
// with the file name.
//
// When agentMode is true, the alerting, rule_files and remote_read sections
// are rejected with an error.
//
// On success the directory containing filename is passed to SetDirectory on
// the Config before it is returned.
//
// Callers should not modify or shallow-copy the returned Config.
func LoadFile(filename string, agentMode bool, logger *slog.Logger) (*Config, error) {
	raw, readErr := os.ReadFile(filename)
	if readErr != nil {
		return nil, readErr
	}

	parsed, parseErr := Load(string(raw), logger)
	if parseErr != nil {
		// Attach the file name so the caller can tell which file failed.
		return nil, fmt.Errorf("parsing YAML file %s: %w", filename, parseErr)
	}

	// Agent mode forbids a few top-level sections; the checks run in the
	// same order as they are listed here and the first hit wins.
	if agentMode {
		switch {
		case len(parsed.AlertingConfig.AlertmanagerConfigs) > 0 || len(parsed.AlertingConfig.AlertRelabelConfigs) > 0:
			return nil, errors.New("field alerting is not allowed in agent mode")
		case len(parsed.RuleFiles) > 0:
			return nil, errors.New("field rule_files is not allowed in agent mode")
		case len(parsed.RemoteReadConfigs) > 0:
			return nil, errors.New("field remote_read is not allowed in agent mode")
		}
	}

	// Record the config file's directory as the base directory.
	parsed.SetDirectory(filepath.Dir(filename))
	return parsed, nil
}

// Load parses the YAML input s into a Config.
//
// The Config starts out as a copy of DefaultConfig and is then filled by a
// strict YAML unmarshal. Afterwards environment variable references in
// external label values are expanded, the OTLP translation strategy is
// checked, and the Config is marked as loaded.
func Load(s string, logger *slog.Logger) (*Config, error) {
	// If the entire config body is empty the UnmarshalYAML method is
	// never called. We thus have to set the DefaultConfig at the entry
	// point as well.
	cfg := new(Config)
	*cfg = DefaultConfig

	if err := yaml.UnmarshalStrict([]byte(s), cfg); err != nil {
		return nil, err
	}

	// resolveEnv maps a variable name found by os.Expand to its replacement.
	resolveEnv := func(name string) string {
		if name == "$" {
			// "$$" is an escaped dollar sign.
			return "$"
		}
		if val := os.Getenv(name); val != "" {
			return val
		}
		logger.Warn("Empty environment variable", "name", name)
		return ""
	}

	// Rebuild the external labels with expanded values.
	builder := labels.NewScratchBuilder(0)
	cfg.GlobalConfig.ExternalLabels.Range(func(lbl labels.Label) {
		expanded := os.Expand(lbl.Value, resolveEnv)
		if expanded != lbl.Value {
			logger.Debug("External label replaced", "label", lbl.Name, "input", lbl.Value, "output", expanded)
		}
		// Note expanded can be blank. https://github.com/prometheus/prometheus/issues/11024
		builder.Add(lbl.Name, expanded)
	})
	if rebuilt := builder.Labels(); !rebuilt.IsEmpty() {
		cfg.GlobalConfig.ExternalLabels = rebuilt
	}

	// Check the OTLP translation strategy against the name validation scheme.
	switch strategy := cfg.OTLPConfig.TranslationStrategy; strategy {
	case UnderscoreEscapingWithSuffixes, "":
		// Accepted without further checks.
	case NoTranslation, NoUTF8EscapingWithSuffixes:
		if cfg.GlobalConfig.MetricNameValidationScheme == LegacyValidationConfig {
			return nil, fmt.Errorf("OTLP translation strategy %q is not allowed when UTF8 is disabled", strategy)
		}
	default:
		return nil, fmt.Errorf("unsupported OTLP translation strategy %q", strategy)
	}

	cfg.loaded = true
	return cfg, nil
}

scrape 配置

以下依次为 main.go 中的调用片段,以及 Config.GetScrapeConfigs 的实现:
// Get scrape configs to validate dynamically loaded scrape_config_files.
// They can change over time, but do the extra validation on startup for better experience.
// The returned configs are discarded here; only the error matters.
if _, err := cfgFile.GetScrapeConfigs(); err != nil {
// Report the absolute path when it can be resolved; otherwise fall back
// to the path exactly as given on the command line.
absPath, pathErr := filepath.Abs(cfg.configFile)
if pathErr != nil {
absPath = cfg.configFile
}
logger.Error(fmt.Sprintf("Error loading dynamic scrape config files from config (--config.file=%q)", cfg.configFile), "file", absPath, "err", err)
os.Exit(2)
}
// When exemplar storage is enabled, take MaxExemplars from the config file,
// substituting the default exemplars config if the file has none.
if cfg.tsdb.EnableExemplarStorage {
if cfgFile.StorageConfig.ExemplarsConfig == nil {
cfgFile.StorageConfig.ExemplarsConfig = &config.DefaultExemplarsConfig
}
cfg.tsdb.MaxExemplars = cfgFile.StorageConfig.ExemplarsConfig.MaxExemplars
}
// Copy the out-of-order time window from the config file when a TSDB
// section is present.
if cfgFile.StorageConfig.TSDBConfig != nil {
cfg.tsdb.OutOfOrderTimeWindow = cfgFile.StorageConfig.TSDBConfig.OutOfOrderTimeWindow
}

// GetScrapeConfigs returns the read-only, validated scrape configurations including
// the ones from the scrape_config_files.
// This method does not write to config, and it's concurrency safe (the pointer receiver is for efficiency).
// This method also assumes the Config was created by Load or LoadFile function, it returns error
// if it was not. We can't re-validate or apply globals here due to races,
// read more https://github.com/prometheus/prometheus/issues/15538.
//
// Every file matched by a scrape_config_files pattern is read and strictly
// unmarshalled on each call. A job name that appears more than once, in the
// main config or in any matched file, results in an error.
func (c *Config) GetScrapeConfigs() ([]*ScrapeConfig, error) {
	if !c.loaded {
		// This is a programming error; fail here rather than later with a
		// more confusing error caused by missing global settings.
		return nil, errors.New("scrape config cannot be fetched, main config was not validated and loaded correctly; should not happen")
	}

	// Start with the scrape configs from the main config file and remember
	// where each job name was first seen.
	result := make([]*ScrapeConfig, len(c.ScrapeConfigs))
	copy(result, c.ScrapeConfigs)
	seenJobs := make(map[string]string)
	for _, sc := range c.ScrapeConfigs {
		seenJobs[sc.JobName] = "main config file"
	}

	// Re-read and validate the dynamically referenced scrape config files.
	for _, pattern := range c.ScrapeConfigFiles {
		matches, globErr := filepath.Glob(pattern)
		if globErr != nil {
			// The only error Glob can return is a malformed pattern.
			return nil, fmt.Errorf("error retrieving scrape config files for %q: %w", pattern, globErr)
		}

		for _, fname := range matches {
			raw, readErr := os.ReadFile(fname)
			if readErr != nil {
				return nil, fileErr(fname, readErr)
			}

			var fileCfg ScrapeConfigs
			if parseErr := yaml.UnmarshalStrict(raw, &fileCfg); parseErr != nil {
				return nil, fileErr(fname, parseErr)
			}

			baseDir := filepath.Dir(fname)
			for _, sc := range fileCfg.ScrapeConfigs {
				if validateErr := sc.Validate(c.GlobalConfig); validateErr != nil {
					return nil, fileErr(fname, validateErr)
				}

				if firstSeen, dup := seenJobs[sc.JobName]; dup {
					return nil, fileErr(fname, fmt.Errorf("found multiple scrape configs with job name %q, first found in %s", sc.JobName, firstSeen))
				}
				seenJobs[sc.JobName] = fmt.Sprintf("%q", filePath(fname))

				// Record the directory of the file the config came from.
				sc.SetDirectory(baseDir)
				result = append(result, sc)
			}
		}
	}
	return result, nil
}