K8s Workloads: DaemonSet

Based on Kubernetes 1.25.

What Is a DaemonSet

The DaemonSet controller ensures that all (or a subset of) nodes each run one replica of a specified Pod. DaemonSet is commonly abbreviated as DS.

Use Cases

Typical scenarios are daemons that must run on every (or some) node: log collection agents (e.g. fluentd), node monitoring agents (e.g. node-exporter), and cluster storage or networking daemons (e.g. a CNI agent or kube-proxy).

DaemonSetSpec

// DaemonSetSpec is the specification of a daemon set.
type DaemonSetSpec struct {
// A label query over pods that are managed by the daemon set.
// Must match in order to be controlled.
// It must match the pod template's labels.
// More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/#label-selectors
// Label selector: must match the Pod template's labels
Selector *metav1.LabelSelector `json:"selector" protobuf:"bytes,1,opt,name=selector"`

// An object that describes the pod that will be created.
// The DaemonSet will create exactly one copy of this pod on every node
// that matches the template's node selector (or on every node if no node
// selector is specified).
// The only allowed template.spec.restartPolicy value is "Always".
// More info: https://kubernetes.io/docs/concepts/workloads/controllers/replicationcontroller#pod-template
// The Pod template used by the controller
Template v1.PodTemplateSpec `json:"template" protobuf:"bytes,2,opt,name=template"`

// An update strategy to replace existing DaemonSet pods with new pods.
// +optional
// Pod update strategy
UpdateStrategy DaemonSetUpdateStrategy `json:"updateStrategy,omitempty" protobuf:"bytes,3,opt,name=updateStrategy"`

// The minimum number of seconds for which a newly created DaemonSet pod should
// be ready without any of its container crashing, for it to be considered
// available. Defaults to 0 (pod will be considered available as soon as it
// is ready).
// +optional
// Minimum time a newly created Pod must stay ready before it counts as available
MinReadySeconds int32 `json:"minReadySeconds,omitempty" protobuf:"varint,4,opt,name=minReadySeconds"`

// The number of old history to retain to allow rollback.
// This is a pointer to distinguish between explicit zero and not specified.
// Defaults to 10.
// +optional
// Maximum number of history revisions kept for rollback; defaults to 10
RevisionHistoryLimit *int32 `json:"revisionHistoryLimit,omitempty" protobuf:"varint,6,opt,name=revisionHistoryLimit"`
}
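
As a minimal sketch of how these fields fit together (assuming the k8s.io/api and k8s.io/apimachinery modules; the name, labels, and image below are made up for illustration), a DaemonSet for a per-node log agent could be declared like this:

import (
	appsv1 "k8s.io/api/apps/v1"
	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

func newLogAgentDaemonSet() *appsv1.DaemonSet {
	labels := map[string]string{"app": "log-agent"} // hypothetical labels
	revisionHistoryLimit := int32(10)

	return &appsv1.DaemonSet{
		ObjectMeta: metav1.ObjectMeta{Name: "log-agent", Namespace: "kube-system"},
		Spec: appsv1.DaemonSetSpec{
			// Selector must match the Pod template's labels.
			Selector: &metav1.LabelSelector{MatchLabels: labels},
			Template: corev1.PodTemplateSpec{
				ObjectMeta: metav1.ObjectMeta{Labels: labels},
				Spec: corev1.PodSpec{
					Containers: []corev1.Container{
						{Name: "agent", Image: "example/log-agent:1.0"}, // hypothetical image
					},
				},
			},
			// The Pod must stay ready for 10s before it is considered available.
			MinReadySeconds:      10,
			RevisionHistoryLimit: &revisionHistoryLimit,
		},
	}
}

Note that Selector and the template's labels must agree, otherwise the API server rejects the object.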

Update Strategies

There are two update strategies: RollingUpdate and OnDelete.

// DaemonSetUpdateStrategy is a struct used to control the update strategy for a DaemonSet.
type DaemonSetUpdateStrategy struct {
// Type of daemon set update. Can be "RollingUpdate" or "OnDelete". Default is RollingUpdate.
// +optional
Type DaemonSetUpdateStrategyType `json:"type,omitempty" protobuf:"bytes,1,opt,name=type"`

// Rolling update config params. Present only if type = "RollingUpdate".
//---
// TODO: Update this to follow our convention for oneOf, whatever we decide it
// to be. Same as Deployment `strategy.rollingUpdate`.
// See https://github.com/kubernetes/kubernetes/issues/35345
// +optional
RollingUpdate *RollingUpdateDaemonSet `json:"rollingUpdate,omitempty" protobuf:"bytes,2,opt,name=rollingUpdate"`
}

// +enum
type DaemonSetUpdateStrategyType string

const (
// Replace the old daemons by new ones using rolling update i.e replace them on each node one after the other.
RollingUpdateDaemonSetStrategyType DaemonSetUpdateStrategyType = "RollingUpdate"

// Replace the old daemons only when it's killed
OnDeleteDaemonSetStrategyType DaemonSetUpdateStrategyType = "OnDelete"
)


// Spec to control the desired behavior of daemon set rolling update.
type RollingUpdateDaemonSet struct {
// The maximum number of DaemonSet pods that can be unavailable during the
// update. Value can be an absolute number (ex: 5) or a percentage of total
// number of DaemonSet pods at the start of the update (ex: 10%). Absolute
// number is calculated from percentage by rounding up.
// This cannot be 0 if MaxSurge is 0
// Default value is 1.
// Example: when this is set to 30%, at most 30% of the total number of nodes
// that should be running the daemon pod (i.e. status.desiredNumberScheduled)
// can have their pods stopped for an update at any given time. The update
// starts by stopping at most 30% of those DaemonSet pods and then brings
// up new DaemonSet pods in their place. Once the new pods are available,
// it then proceeds onto other DaemonSet pods, thus ensuring that at least
// 70% of original number of DaemonSet pods are available at all times during
// the update.
// +optional
// Maximum number of DaemonSet Pods that can be unavailable during an update
MaxUnavailable *intstr.IntOrString `json:"maxUnavailable,omitempty" protobuf:"bytes,1,opt,name=maxUnavailable"`

// The maximum number of nodes with an existing available DaemonSet pod that
// can have an updated DaemonSet pod during an update.
// Value can be an absolute number (ex: 5) or a percentage of desired pods (ex: 10%).
// This can not be 0 if MaxUnavailable is 0.
// Absolute number is calculated from percentage by rounding up to a minimum of 1.
// Default value is 0.
// Example: when this is set to 30%, at most 30% of the total number of nodes
// that should be running the daemon pod (i.e. status.desiredNumberScheduled)
// can have a new pod created before the old pod is marked as deleted.
// The update starts by launching new pods on 30% of nodes. Once an updated
// pod is available (Ready for at least minReadySeconds) the old DaemonSet pod
// on that node is marked deleted. If the old pod becomes unavailable for any
// reason (Ready transitions to false, is evicted, or is drained) an updated
// pod is immediately created on that node without considering surge limits.
// Allowing surge implies the possibility that the resources consumed by the
// daemonset on any given node can double if the readiness check fails, and
// so resource intensive daemonsets should take into account that they may
// cause evictions during disruption.
// +optional
// Maximum number of nodes with an available DaemonSet Pod that can also run an updated (surge) Pod during an update
MaxSurge *intstr.IntOrString `json:"maxSurge,omitempty" protobuf:"bytes,2,opt,name=maxSurge"`
}
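
A minimal sketch of how these two knobs are usually set, mirroring the 30% example in the comments above (assuming the k8s.io/api/apps/v1 and intstr packages; the concrete values are illustrative, not defaults):

import (
	appsv1 "k8s.io/api/apps/v1"
	"k8s.io/apimachinery/pkg/util/intstr"
)

// rollingUpdateStrategy allows at most 30% of the desired DaemonSet Pods to be
// unavailable at any time during an update and creates no surge Pods.
func rollingUpdateStrategy() appsv1.DaemonSetUpdateStrategy {
	maxUnavailable := intstr.FromString("30%")
	maxSurge := intstr.FromInt(0)

	return appsv1.DaemonSetUpdateStrategy{
		Type: appsv1.RollingUpdateDaemonSetStrategyType,
		RollingUpdate: &appsv1.RollingUpdateDaemonSet{
			MaxUnavailable: &maxUnavailable,
			MaxSurge:       &maxSurge,
		},
	}
}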

Node Synchronization

Node synchronization is implemented mainly by the syncNodes func, which:

  • Builds the Pods from the template and adds the necessary tolerations for the node
  • Slow-starts: creates the missing Pods in exponentially growing batches, the same logic as the ReplicaSet controller (see the sketch after this list)
    • Sets the Pod's node affinity so that only one instance of the Pod runs on each target node
    • Calls the CreatePods func to create the Pods
  • Calls DeletePod to delete the surplus Pods
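
A simplified sketch of the slow-start batching mentioned above (the upstream helper is named slowStartBatch; error handling and expectation bookkeeping are omitted here): batches start small and double only while every creation in the previous batch succeeded, so a consistently failing Pod template does not flood the API server.

import "sync"

func minInt(a, b int) int {
	if a < b {
		return a
	}
	return b
}

// slowStartBatch calls fn up to count times, in batches that start at
// initialBatchSize and double after every fully successful batch; it stops
// as soon as any call in a batch fails. It returns the number of successes.
func slowStartBatch(count int, initialBatchSize int, fn func() error) (int, error) {
	remaining := count
	successes := 0
	for batchSize := minInt(remaining, initialBatchSize); batchSize > 0; batchSize = minInt(2*batchSize, remaining) {
		errCh := make(chan error, batchSize)
		var wg sync.WaitGroup
		wg.Add(batchSize)
		for i := 0; i < batchSize; i++ {
			go func() {
				defer wg.Done()
				if err := fn(); err != nil {
					errCh <- err
				}
			}()
		}
		wg.Wait()
		successes += batchSize - len(errCh)
		if len(errCh) > 0 {
			// Stop doubling as soon as any creation in the batch fails.
			return successes, <-errCh
		}
		remaining -= batchSize
	}
	return successes, nil
}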

Taints and Tolerations

The following tolerations are automatically added to DaemonSet Pods:

Toleration                               Effect       Description
node.kubernetes.io/not-ready             NoExecute    DaemonSet Pods are not evicted from the node when node problems occur.
node.kubernetes.io/unreachable           NoExecute    DaemonSet Pods are not evicted from the node when node problems occur.
node.kubernetes.io/disk-pressure         NoSchedule   DaemonSet Pods tolerate the disk-pressure taint and can still be scheduled onto nodes with disk pressure.
node.kubernetes.io/memory-pressure       NoSchedule   DaemonSet Pods tolerate the memory-pressure taint and can still be scheduled onto nodes with memory pressure.
node.kubernetes.io/unschedulable         NoSchedule   The default scheduler tolerates the unschedulable attribute of the node for DaemonSet Pods.
node.kubernetes.io/network-unavailable   NoSchedule   The default scheduler tolerates the network-unavailable attribute of the node for DaemonSet Pods that use host network.
node.kubernetes.io/pid-pressure          NoSchedule   DaemonSet Pods tolerate the pid-pressure taint and can still be scheduled onto nodes with PID pressure.

These tolerations are added by the AddOrUpdateDaemonPodTolerations func:

// AddOrUpdateDaemonPodTolerations apply necessary tolerations to DaemonSet Pods, e.g. node.kubernetes.io/not-ready:NoExecute.
func AddOrUpdateDaemonPodTolerations(spec *v1.PodSpec) {
// DaemonSet pods shouldn't be deleted by NodeController in case of node problems.
// Add infinite toleration for taint notReady:NoExecute here
// to survive taint-based eviction enforced by NodeController
// when node turns not ready.
v1helper.AddOrUpdateTolerationInPodSpec(spec, &v1.Toleration{
Key: v1.TaintNodeNotReady,
Operator: v1.TolerationOpExists,
Effect: v1.TaintEffectNoExecute,
})

// DaemonSet pods shouldn't be deleted by NodeController in case of node problems.
// Add infinite toleration for taint unreachable:NoExecute here
// to survive taint-based eviction enforced by NodeController
// when node turns unreachable.
v1helper.AddOrUpdateTolerationInPodSpec(spec, &v1.Toleration{
Key: v1.TaintNodeUnreachable,
Operator: v1.TolerationOpExists,
Effect: v1.TaintEffectNoExecute,
})

// According to TaintNodesByCondition feature, all DaemonSet pods should tolerate
// MemoryPressure, DiskPressure, PIDPressure, Unschedulable and NetworkUnavailable taints.
v1helper.AddOrUpdateTolerationInPodSpec(spec, &v1.Toleration{
Key: v1.TaintNodeDiskPressure,
Operator: v1.TolerationOpExists,
Effect: v1.TaintEffectNoSchedule,
})

v1helper.AddOrUpdateTolerationInPodSpec(spec, &v1.Toleration{
Key: v1.TaintNodeMemoryPressure,
Operator: v1.TolerationOpExists,
Effect: v1.TaintEffectNoSchedule,
})

v1helper.AddOrUpdateTolerationInPodSpec(spec, &v1.Toleration{
Key: v1.TaintNodePIDPressure,
Operator: v1.TolerationOpExists,
Effect: v1.TaintEffectNoSchedule,
})

v1helper.AddOrUpdateTolerationInPodSpec(spec, &v1.Toleration{
Key: v1.TaintNodeUnschedulable,
Operator: v1.TolerationOpExists,
Effect: v1.TaintEffectNoSchedule,
})

if spec.HostNetwork {
v1helper.AddOrUpdateTolerationInPodSpec(spec, &v1.Toleration{
Key: v1.TaintNodeNetworkUnavailable,
Operator: v1.TolerationOpExists,
Effect: v1.TaintEffectNoSchedule,
})
}
}

Setting the Scheduling Affinity

The controller uses ReplaceDaemonSetPodNodeNameNodeAffinity to set a hard node affinity (requiredDuringSchedulingIgnoredDuringExecution) on the Pod, pinning it to its target node:

nodeAffinity:
  requiredDuringSchedulingIgnoredDuringExecution:
    nodeSelectorTerms:
    - matchFields:
      - key: metadata.name
        operator: In
        values:
        - <node-name>
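
A hedged sketch of how that affinity is assembled from the core/v1 types (the helper name below is illustrative; the real controller also merges the requirement into any affinity already present on the Pod):

import (
	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// nodeNameAffinity builds a required node affinity that pins a DaemonSet Pod
// to a single node by matching the node object's metadata.name field.
func nodeNameAffinity(nodeName string) *corev1.Affinity {
	return &corev1.Affinity{
		NodeAffinity: &corev1.NodeAffinity{
			RequiredDuringSchedulingIgnoredDuringExecution: &corev1.NodeSelector{
				NodeSelectorTerms: []corev1.NodeSelectorTerm{
					{
						MatchFields: []corev1.NodeSelectorRequirement{
							{
								Key:      metav1.ObjectNameField, // "metadata.name"
								Operator: corev1.NodeSelectorOpIn,
								Values:   []string{nodeName},
							},
						},
					},
				},
			},
		},
	}
}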