K8s Core Resource Objects - Pod (Resource Quotas and cgroups)

Based on Kubernetes 1.25

Pod Resource Quotas

A Pod sets its resource quota mainly through requests and limits (see the arithmetic sketch after this list):

  • requests.cpu=250m actually sets the cgroup cpu.shares value to (250/1000)*1024; the default is 1024

  • limits.cpu=500m actually sets the cgroup cpu.cfs_quota_us value to (500/1000)*100ms, while cpu.cfs_period_us is always 100ms

  • limits.memory=128Mi sets the cgroup memory.limit_in_bytes to 128 * 1024 * 1024; when scheduling, only requests.memory (64Mi in this example) is used for the decision

  • If limits is set but requests is not specified, and the admission-time defaulting has not been applied, requests is set to the same value as limits

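The mapping can be made concrete with a small sketch. This is not the kubelet's code: the two helpers below simply mirror the arithmetic of cm.MilliCPUToShares and milliCPUToQuota described above, omitting the minimum-value handling the real helpers apply.

package main

import "fmt"

const (
	sharesPerCPU  = 1024
	milliCPUToCPU = 1000
	quotaPeriod   = 100000 // cpu.cfs_period_us: 100ms in microseconds
)

// milliCPUToShares mirrors the conversion from a CPU request in millicores to cgroup cpu.shares.
func milliCPUToShares(milliCPU int64) int64 {
	return milliCPU * sharesPerCPU / milliCPUToCPU
}

// milliCPUToQuota mirrors the conversion from a CPU limit in millicores to cgroup
// cpu.cfs_quota_us for a given cpu.cfs_period_us.
func milliCPUToQuota(milliCPU, period int64) int64 {
	return milliCPU * period / milliCPUToCPU
}

func main() {
	fmt.Println(milliCPUToShares(250))             // 256       -> cpu.shares for requests.cpu=250m
	fmt.Println(milliCPUToQuota(500, quotaPeriod)) // 50000     -> cpu.cfs_quota_us for limits.cpu=500m
	fmt.Println(int64(128 * 1024 * 1024))          // 134217728 -> memory.limit_in_bytes for limits.memory=128Mi
}
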
The flow for generating the cgroup resources is as follows:

  1. Initializing the pause container and creating the other containers both call calculateLinuxResources to compute the final resources
  2. The Linux container resource quota, LinuxContainerResources, is generated

LinuxContainerResources Fields

  • CpuPeriod: the length of the container's CPU CFS period, in microseconds

  • CpuQuota: the amount of CPU time the container may use within one period, in microseconds

    • If CpuQuota is not set, the container is allowed to use all of the CPU time
    • If CpuQuota is less than CpuPeriod, the container's CPU is limited to the ratio CpuQuota/CpuPeriod
  • CpuShares: the container's share of CPU time. CpuShares expresses relative access to CPU time, not an actual number of time slices. When all CPU time is contended for, a container with a larger CpuShares value gets more CPU time

  • MemoryLimitInBytes: a hard limit on the container's memory usage, in bytes. The container cannot use more than this limit; if it exceeds it, the container is forcibly terminated

  • OomScoreAdj: a mechanism for tuning the kernel's Out-Of-Memory killer, which terminates some processes to free memory when the system is short on memory

    • Adjusting this value raises or lowers the risk of the container being killed by the OOM killer
    • The default is 0, i.e. no adjustment
  • CpusetCpus: the list of CPU IDs the container may use. For example, "0-2,5,7" means the container can run on CPUs 0, 1, 2, 5 and 7

  • CpusetMems: the list of memory (NUMA) nodes the container may use

  • HugepageLimits: limits on the container's hugepage usage, per page size

  • Unified: Linux cgroup v2 parameters, given as key/value pairs of parameter name and value

  • MemorySwapLimitInBytes: a hard limit on the container's swap usage, in bytes

    • If not set, it defaults to twice MemoryLimitInBytes
  • Ref:https://github.com/kubernetes/cri-api/blob/2c8d015e0d408208ca8843c1d6e2e2fce1e5dd94/pkg/apis/runtime/v1/api.pb.go#L3256

// LinuxContainerResources specifies Linux specific configuration for
// resources.
type LinuxContainerResources struct {
	// CPU CFS (Completely Fair Scheduler) period. Default: 0 (not specified).
	CpuPeriod int64 `protobuf:"varint,1,opt,name=cpu_period,json=cpuPeriod,proto3" json:"cpu_period,omitempty"`
	// CPU CFS (Completely Fair Scheduler) quota. Default: 0 (not specified).
	CpuQuota int64 `protobuf:"varint,2,opt,name=cpu_quota,json=cpuQuota,proto3" json:"cpu_quota,omitempty"`
	// CPU shares (relative weight vs. other containers). Default: 0 (not specified).
	CpuShares int64 `protobuf:"varint,3,opt,name=cpu_shares,json=cpuShares,proto3" json:"cpu_shares,omitempty"`
	// Memory limit in bytes. Default: 0 (not specified).
	MemoryLimitInBytes int64 `protobuf:"varint,4,opt,name=memory_limit_in_bytes,json=memoryLimitInBytes,proto3" json:"memory_limit_in_bytes,omitempty"`
	// OOMScoreAdj adjusts the oom-killer score. Default: 0 (not specified).
	OomScoreAdj int64 `protobuf:"varint,5,opt,name=oom_score_adj,json=oomScoreAdj,proto3" json:"oom_score_adj,omitempty"`
	// CpusetCpus constrains the allowed set of logical CPUs. Default: "" (not specified).
	CpusetCpus string `protobuf:"bytes,6,opt,name=cpuset_cpus,json=cpusetCpus,proto3" json:"cpuset_cpus,omitempty"`
	// CpusetMems constrains the allowed set of memory nodes. Default: "" (not specified).
	CpusetMems string `protobuf:"bytes,7,opt,name=cpuset_mems,json=cpusetMems,proto3" json:"cpuset_mems,omitempty"`
	// List of HugepageLimits to limit the HugeTLB usage of container per page size. Default: nil (not specified).
	HugepageLimits []*HugepageLimit `protobuf:"bytes,8,rep,name=hugepage_limits,json=hugepageLimits,proto3" json:"hugepage_limits,omitempty"`
	// Unified resources for cgroup v2. Default: nil (not specified).
	// Each key/value in the map refers to the cgroup v2.
	// e.g. "memory.max": "6937202688" or "io.weight": "default 100".
	Unified map[string]string `protobuf:"bytes,9,rep,name=unified,proto3" json:"unified,omitempty" protobuf_key:"bytes,1,opt,name=key,proto3" protobuf_val:"bytes,2,opt,name=value,proto3"`
	// Memory swap limit in bytes. Default 0 (not specified).
	MemorySwapLimitInBytes int64 `protobuf:"varint,10,opt,name=memory_swap_limit_in_bytes,json=memorySwapLimitInBytes,proto3" json:"memory_swap_limit_in_bytes,omitempty"`
	// Used internally by Protocol Buffers.
	XXX_NoUnkeyedLiteral struct{} `json:"-"`
	// Internal protobuf field that caches the message size to speed up serialization.
	XXX_sizecache int32 `json:"-"`
}
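
To connect these fields back to the quota examples at the top of the section, the snippet below fills in a LinuxContainerResources by hand for a container with requests.cpu=250m, limits.cpu=500m and limits.memory=128Mi. The values follow the formulas above and are illustrative rather than produced by the kubelet.

package main

import (
	"fmt"

	runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1"
)

func main() {
	// Hand-computed values for requests.cpu=250m, limits.cpu=500m, limits.memory=128Mi.
	resources := &runtimeapi.LinuxContainerResources{
		CpuShares:          256,       // (250/1000) * 1024
		CpuPeriod:          100000,    // 100ms, in microseconds
		CpuQuota:           50000,     // (500/1000) * 100ms, in microseconds
		MemoryLimitInBytes: 134217728, // 128 * 1024 * 1024
	}
	fmt.Printf("%+v\n", resources)
}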

Calculating the Resource Quota

This is implemented mainly by the applySandboxResources func (a pod-spec sketch follows the steps below):

  1. Compute the quotas of the init containers and the regular containers
  2. Add the resources specified by Overhead
  3. Initialize the common container configuration
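
As a rough illustration of the inputs to these steps, here is a hypothetical pod that carries both container requests/limits and a RuntimeClass-style Overhead (types from k8s.io/api/core/v1; the concrete values are made up):

package main

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
)

func main() {
	// Hypothetical pod: Overhead (normally injected from a RuntimeClass) is added
	// on top of the container resources when the sandbox resources are computed.
	pod := &corev1.Pod{
		Spec: corev1.PodSpec{
			Containers: []corev1.Container{{
				Name: "app",
				Resources: corev1.ResourceRequirements{
					Requests: corev1.ResourceList{
						corev1.ResourceCPU:    resource.MustParse("250m"),
						corev1.ResourceMemory: resource.MustParse("64Mi"),
					},
					Limits: corev1.ResourceList{
						corev1.ResourceCPU:    resource.MustParse("500m"),
						corev1.ResourceMemory: resource.MustParse("128Mi"),
					},
				},
			}},
			Overhead: corev1.ResourceList{
				corev1.ResourceCPU:    resource.MustParse("100m"),
				corev1.ResourceMemory: resource.MustParse("32Mi"),
			},
		},
	}
	fmt.Println(pod.Spec.Overhead.Cpu().MilliValue()) // 100
	fmt.Println(pod.Spec.Overhead.Memory().Value())   // 33554432
}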

Calculating the cgroup Resource Limits

calculateLinuxResources converts the user-specified values into Linux cgroup parameters:

// calculateLinuxResources will create the linuxContainerResources type based on the provided CPU and memory resource requests, limits
func (m *kubeGenericRuntimeManager) calculateLinuxResources(cpuRequest, cpuLimit, memoryLimit *resource.Quantity) *runtimeapi.LinuxContainerResources {
	resources := runtimeapi.LinuxContainerResources{}
	var cpuShares int64

	memLimit := memoryLimit.Value()

	// If request is not specified, but limit is, we want request to default to limit.
	// API server does this for new containers, but we repeat this logic in Kubelet
	// for containers running on existing Kubernetes clusters.
	// In other words, if requests is unset but limits is set, the request defaults to the limit.
	if cpuRequest.IsZero() && !cpuLimit.IsZero() {
		cpuShares = int64(cm.MilliCPUToShares(cpuLimit.MilliValue()))
	} else {
		// if cpuRequest.Amount is nil, then MilliCPUToShares will return the minimal number
		// of CPU shares.
		cpuShares = int64(cm.MilliCPUToShares(cpuRequest.MilliValue()))
	}
	// Set cpuShares.
	resources.CpuShares = cpuShares
	if memLimit != 0 {
		resources.MemoryLimitInBytes = memLimit
	}

	// Only set the CFS quota when the --cpu-cfs-quota flag is true.
	if m.cpuCFSQuota {
		// if cpuLimit.Amount is nil, then the appropriate default value is returned
		// to allow full usage of cpu resource.
		// The period defaults to 100ms.
		cpuPeriod := int64(quotaPeriod)
		if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.CPUCFSQuotaPeriod) {
			cpuPeriod = int64(m.cpuCFSQuotaPeriod.Duration / time.Microsecond)
		}
		cpuQuota := milliCPUToQuota(cpuLimit.MilliValue(), cpuPeriod)
		resources.CpuQuota = cpuQuota
		resources.CpuPeriod = cpuPeriod
	}

	return &resources
}

The Scheduler's Formula for Computing Resources

// resourceRequest = max(sum(podSpec.Containers), podSpec.InitContainers) + overHead
func calculateResource(pod *v1.Pod) (res Resource, non0CPU int64, non0Mem int64) {
	resPtr := &res
	for _, c := range pod.Spec.Containers {
		resPtr.Add(c.Resources.Requests)
		non0CPUReq, non0MemReq := schedutil.GetNonzeroRequests(&c.Resources.Requests)
		non0CPU += non0CPUReq
		non0Mem += non0MemReq
		// No non-zero resources for GPUs or opaque resources.
	}
	// max(sum(podSpec.Containers),podSpec.InitContainers)+overhead
	for _, ic := range pod.Spec.InitContainers {
		resPtr.SetMaxResource(ic.Resources.Requests)
		non0CPUReq, non0MemReq := schedutil.GetNonzeroRequests(&ic.Resources.Requests)
		non0CPU = max(non0CPU, non0CPUReq)
		non0Mem = max(non0Mem, non0MemReq)
	}

	// If Overhead is being utilized, add to the total requests for the pod
	if pod.Spec.Overhead != nil {
		resPtr.Add(pod.Spec.Overhead)
		if _, found := pod.Spec.Overhead[v1.ResourceCPU]; found {
			non0CPU += pod.Spec.Overhead.Cpu().MilliValue()
		}

		if _, found := pod.Spec.Overhead[v1.ResourceMemory]; found {
			non0Mem += pod.Spec.Overhead.Memory().Value()
		}
	}

	return
}
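
A quick worked example of the formula, with hypothetical numbers: for a pod with two containers requesting 250m CPU each, one init container requesting 600m, and an Overhead of 100m, the scheduler accounts for max(250m+250m, 600m) + 100m = 700m CPU.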

How the Resource Data Shown by the Describe Command Is Calculated

// PodRequestsAndLimits returns a dictionary of all defined resources summed up for all
// containers of the pod. If pod overhead is non-nil, the pod overhead is added to the
// total container resource requests and to the total container limits which have a
// non-zero quantity.
func PodRequestsAndLimits(pod *corev1.Pod) (reqs, limits corev1.ResourceList) {
	reqs, limits = corev1.ResourceList{}, corev1.ResourceList{}
	for _, container := range pod.Spec.Containers {
		addResourceList(reqs, container.Resources.Requests)
		addResourceList(limits, container.Resources.Limits)
	}
	// init containers define the minimum of any resource
	for _, container := range pod.Spec.InitContainers {
		maxResourceList(reqs, container.Resources.Requests)
		maxResourceList(limits, container.Resources.Limits)
	}

	// Add overhead for running a pod to the sum of requests and to non-zero limits:
	if pod.Spec.Overhead != nil {
		addResourceList(reqs, pod.Spec.Overhead)

		for name, quantity := range pod.Spec.Overhead {
			if value, ok := limits[name]; ok && !value.IsZero() {
				value.Add(quantity)
				limits[name] = value
			}
		}
	}
	return
}
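
A hedged usage sketch of this helper: the import path below is an assumption based on where kubectl keeps PodRequestsAndLimits, and the pod is a made-up single-container example.

package main

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
	resourcehelper "k8s.io/kubectl/pkg/util/resource"
)

func main() {
	pod := &corev1.Pod{
		Spec: corev1.PodSpec{
			Containers: []corev1.Container{{
				Name: "app",
				Resources: corev1.ResourceRequirements{
					Requests: corev1.ResourceList{
						corev1.ResourceCPU:    resource.MustParse("250m"),
						corev1.ResourceMemory: resource.MustParse("64Mi"),
					},
					Limits: corev1.ResourceList{
						corev1.ResourceCPU:    resource.MustParse("500m"),
						corev1.ResourceMemory: resource.MustParse("128Mi"),
					},
				},
			}},
		},
	}
	// Sum up the pod's requests and limits, as `kubectl describe` does.
	reqs, limits := resourcehelper.PodRequestsAndLimits(pod)
	fmt.Println("requests:", reqs.Cpu().String(), reqs.Memory().String())    // 250m 64Mi
	fmt.Println("limits:  ", limits.Cpu().String(), limits.Memory().String()) // 500m 128Mi
}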