K8s Workloads: Deployment

Based on Kubernetes 1.25

What is a Deployment

A Deployment (short name: deploy) is most commonly used to run stateless services.

  • A Deployment controls Pods and ReplicaSets declaratively
  • Supports rolling upgrades and rollbacks of an application
  • Supports scaling up and scaling down
  • Supports pausing and resuming a Deployment (see the client-go sketch after this list)
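
These operations can also be driven through the API. The sketch below is not from the article: it is a minimal client-go example, assuming a local kubeconfig, namespace "default", and a Deployment named "demo", that pauses the rollout, scales it via the scale subresource, and resumes it.

package main

import (
    "context"
    "fmt"

    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    "k8s.io/apimachinery/pkg/types"
    "k8s.io/client-go/kubernetes"
    "k8s.io/client-go/tools/clientcmd"
)

func main() {
    // Build a clientset from the local kubeconfig (assumption: ~/.kube/config exists).
    config, err := clientcmd.BuildConfigFromFlags("", clientcmd.RecommendedHomeFile)
    if err != nil {
        panic(err)
    }
    clientset, err := kubernetes.NewForConfig(config)
    if err != nil {
        panic(err)
    }
    deploys := clientset.AppsV1().Deployments("default")
    ctx := context.Background()

    // Pause the rollout (what `kubectl rollout pause` does): set spec.paused=true.
    if _, err := deploys.Patch(ctx, "demo", types.MergePatchType,
        []byte(`{"spec":{"paused":true}}`), metav1.PatchOptions{}); err != nil {
        panic(err)
    }

    // Scale to 5 replicas through the scale subresource.
    scale, err := deploys.GetScale(ctx, "demo", metav1.GetOptions{})
    if err != nil {
        panic(err)
    }
    scale.Spec.Replicas = 5
    if _, err := deploys.UpdateScale(ctx, "demo", scale, metav1.UpdateOptions{}); err != nil {
        panic(err)
    }

    // Resume the rollout (what `kubectl rollout resume` does): set spec.paused=false.
    if _, err := deploys.Patch(ctx, "demo", types.MergePatchType,
        []byte(`{"spec":{"paused":false}}`), metav1.PatchOptions{}); err != nil {
        panic(err)
    }
    fmt.Println("deployment demo paused, scaled and resumed")
}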

DeploymentSpec

// DeploymentSpec is the specification of the desired behavior of the Deployment.
type DeploymentSpec struct {
    // Number of desired pods. This is a pointer to distinguish between explicit
    // zero and not specified. Defaults to 1.
    // +optional
    // Desired number of Pods, defaults to 1
    Replicas *int32 `json:"replicas,omitempty" protobuf:"varint,1,opt,name=replicas"`

    // Label selector for pods. Existing ReplicaSets whose pods are
    // selected by this will be the ones affected by this deployment.
    // It must match the pod template's labels.
    // Label selector; it must match the PodTemplate's labels
    Selector *metav1.LabelSelector `json:"selector" protobuf:"bytes,2,opt,name=selector"`

    // Template describes the pods that will be created.
    // The only allowed template.spec.restartPolicy value is "Always".
    // Describes the Pods managed by the Deployment; the Deployment does not act on it directly, it is passed down to the ReplicaSet
    Template v1.PodTemplateSpec `json:"template" protobuf:"bytes,3,opt,name=template"`

    // The deployment strategy to use to replace existing pods with new ones.
    // +optional
    // +patchStrategy=retainKeys
    // Deployment strategy; options are RollingUpdate (rolling deployment) and Recreate (terminate the old Pods before creating new ones)
    Strategy DeploymentStrategy `json:"strategy,omitempty" patchStrategy:"retainKeys" protobuf:"bytes,4,opt,name=strategy"`

    // Minimum number of seconds for which a newly created pod should be ready
    // without any of its container crashing, for it to be considered available.
    // Defaults to 0 (pod will be considered available as soon as it is ready)
    // +optional
    // Minimum number of seconds a newly created Pod must stay Ready before it counts as available; the rollout waits for MinReadySeconds before moving on to the next Pod, default=0
    MinReadySeconds int32 `json:"minReadySeconds,omitempty" protobuf:"varint,5,opt,name=minReadySeconds"`

    // The number of old ReplicaSets to retain to allow rollback.
    // This is a pointer to distinguish between explicit zero and not specified.
    // Defaults to 10.
    // +optional
    // Number of old ReplicaSets retained to allow rollback, default=10
    RevisionHistoryLimit *int32 `json:"revisionHistoryLimit,omitempty" protobuf:"varint,6,opt,name=revisionHistoryLimit"`

    // Indicates that the deployment is paused.
    // +optional
    // Switch used to pause the rollout
    // kubectl rollout pause sets it to true
    // kubectl rollout resume sets it to false
    Paused bool `json:"paused,omitempty" protobuf:"varint,7,opt,name=paused"`

    // The maximum time in seconds for a deployment to make progress before it
    // is considered to be failed. The deployment controller will continue to
    // process failed deployments and a condition with a ProgressDeadlineExceeded
    // reason will be surfaced in the deployment status. Note that progress will
    // not be estimated during the time a deployment is paused. Defaults to 600s.
    // Progress deadline in seconds, default 600
    // If the rollout makes no progress within ten minutes, it is marked as failed via a ProgressDeadlineExceeded condition
    ProgressDeadlineSeconds *int32 `json:"progressDeadlineSeconds,omitempty" protobuf:"varint,9,opt,name=progressDeadlineSeconds"`
}
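
A quick illustration of how these fields fit together (not from the article; the name "demo", labels, nginx image, and all numbers are illustrative assumptions), using the k8s.io/api types:

package main

import (
    "fmt"

    appsv1 "k8s.io/api/apps/v1"
    corev1 "k8s.io/api/core/v1"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

func int32Ptr(i int32) *int32 { return &i }

func main() {
    // The selector must match the Pod template's labels.
    labels := map[string]string{"app": "demo"}

    spec := appsv1.DeploymentSpec{
        Replicas:                int32Ptr(3),   // desired Pods (defaults to 1 when nil)
        Selector:                &metav1.LabelSelector{MatchLabels: labels},
        MinReadySeconds:         10,            // a Pod must stay Ready 10s before it counts as available
        RevisionHistoryLimit:    int32Ptr(10),  // old ReplicaSets kept for rollback
        ProgressDeadlineSeconds: int32Ptr(600), // mark the rollout as failed after 600s without progress
        Strategy: appsv1.DeploymentStrategy{
            Type: appsv1.RollingUpdateDeploymentStrategyType,
        },
        Template: corev1.PodTemplateSpec{
            ObjectMeta: metav1.ObjectMeta{Labels: labels},
            Spec: corev1.PodSpec{
                Containers: []corev1.Container{
                    {Name: "app", Image: "nginx:1.25"},
                },
            },
        },
    }
    fmt.Printf("%+v\n", spec)
}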

PodTemplateSpec

PodTemplate is a top-level K8s resource object; a Deployment embeds a PodTemplateSpec to describe the Pods it creates

// PodTemplateSpec describes the data a pod should have when created from a template
type PodTemplateSpec struct {
    // Metadata of the pods created from this template.
    // +optional
    metav1.ObjectMeta

    // Spec defines the behavior of a pod.
    // +optional
    Spec PodSpec
}

DeploymentStrategy

DeploymentStrategy contains two fields

// DeploymentStrategy describes how to replace existing pods with new ones.
type DeploymentStrategy struct {
    // Type of deployment. Can be "Recreate" or "RollingUpdate". Default is RollingUpdate.
    // +optional
    Type DeploymentStrategyType `json:"type,omitempty" protobuf:"bytes,1,opt,name=type,casttype=DeploymentStrategyType"`

    // Rolling update config params. Present only if DeploymentStrategyType =
    // RollingUpdate.
    //---
    // TODO: Update this to follow our convention for oneOf, whatever we decide it
    // to be.
    // +optional
    // MaxUnavailable: maximum number of Pods that may be unavailable during the update; the value can be an absolute number or a percentage
    // If MaxSurge=0, MaxUnavailable must not be 0; default 25%, i.e. at least 75% of the desired Pods remain available
    //
    // MaxSurge: maximum number of Pods that may be created above the desired count; the value can be an absolute number or a percentage
    // If MaxUnavailable=0, MaxSurge must not be 0; default 25%, i.e. at most 125% of the desired Pod count in total
    RollingUpdate *RollingUpdateDeployment `json:"rollingUpdate,omitempty" protobuf:"bytes,2,opt,name=rollingUpdate"`
}

Setting MaxUnavailable=0 and MaxSurge=1 starts one new Pod at a time and deletes an old Pod only after the new one is up, so the rollout is smooth and does not disrupt the service.
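
In Go this configuration looks roughly like the sketch below (an illustrative example, not from the article), using intstr to express the two values:

package main

import (
    "fmt"

    appsv1 "k8s.io/api/apps/v1"
    "k8s.io/apimachinery/pkg/util/intstr"
)

func main() {
    // maxUnavailable=0 keeps every desired Pod serving during the update;
    // maxSurge=1 adds one extra Pod at a time, so an old Pod is removed only
    // after its replacement is available.
    maxUnavailable := intstr.FromInt(0)
    maxSurge := intstr.FromInt(1)

    strategy := appsv1.DeploymentStrategy{
        Type: appsv1.RollingUpdateDeploymentStrategyType,
        RollingUpdate: &appsv1.RollingUpdateDeployment{
            MaxUnavailable: &maxUnavailable,
            MaxSurge:       &maxSurge,
        },
    }
    fmt.Printf("%+v\n", strategy)
}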

Computing MaxUnavailable and MaxSurge

  1. MaxUnavailable and MaxSurge are computed by two funcs
  2. Both call ResolveFenceposts internally, which returns the two values together
  3. ResolveFenceposts in turn calls the GetScaledValueFromIntOrPercent func to resolve each value
    1. If the value is an absolute number, it is returned directly
    2. If it is a percentage, it is applied to the total replica count and rounded up or down depending on the argument
  4. If both values resolve to 0, maxUnavailable is set to 1
  • Ref:https://github.com/kubernetes/kubernetes/blob/88e994f6bf8fc88114c5b733e09afea339bea66d/pkg/controller/deployment/util/deployment_util.go#L436

    // MaxUnavailable returns the maximum unavailable pods a rolling deployment can take.
    func MaxUnavailable(deployment apps.Deployment) int32 {
        if !IsRollingUpdate(&deployment) || *(deployment.Spec.Replicas) == 0 {
            return int32(0)
        }
        // Error caught by validation
        _, maxUnavailable, _ := ResolveFenceposts(deployment.Spec.Strategy.RollingUpdate.MaxSurge, deployment.Spec.Strategy.RollingUpdate.MaxUnavailable, *(deployment.Spec.Replicas))
        if maxUnavailable > *deployment.Spec.Replicas {
            return *deployment.Spec.Replicas
        }
        return maxUnavailable
    }
    ...
    // MaxSurge returns the maximum surge pods a rolling deployment can take.
    func MaxSurge(deployment apps.Deployment) int32 {
        if !IsRollingUpdate(&deployment) {
            return int32(0)
        }
        // Error caught by validation
        maxSurge, _, _ := ResolveFenceposts(deployment.Spec.Strategy.RollingUpdate.MaxSurge, deployment.Spec.Strategy.RollingUpdate.MaxUnavailable, *(deployment.Spec.Replicas))
        return maxSurge
    }
    ...
    // ResolveFenceposts resolves both maxSurge and maxUnavailable. This needs to happen in one
    // step. For example:
    //
    // 2 desired, max unavailable 1%, surge 0% - should scale old(-1), then new(+1), then old(-1), then new(+1)
    // 1 desired, max unavailable 1%, surge 0% - should scale old(-1), then new(+1)
    // 2 desired, max unavailable 25%, surge 1% - should scale new(+1), then old(-1), then new(+1), then old(-1)
    // 1 desired, max unavailable 25%, surge 1% - should scale new(+1), then old(-1)
    // 2 desired, max unavailable 0%, surge 1% - should scale new(+1), then old(-1), then new(+1), then old(-1)
    // 1 desired, max unavailable 0%, surge 1% - should scale new(+1), then old(-1)
    // Ref:https://github.com/kubernetes/kubernetes/blob/88e994f6bf8fc88114c5b733e09afea339bea66d/pkg/controller/deployment/util/deployment_util.go#L841C1-L870C1
    func ResolveFenceposts(maxSurge, maxUnavailable *intstrutil.IntOrString, desired int32) (int32, int32, error) {
        surge, err := intstrutil.GetScaledValueFromIntOrPercent(intstrutil.ValueOrDefault(maxSurge, intstrutil.FromInt(0)), int(desired), true)
        if err != nil {
            return 0, 0, err
        }
        unavailable, err := intstrutil.GetScaledValueFromIntOrPercent(intstrutil.ValueOrDefault(maxUnavailable, intstrutil.FromInt(0)), int(desired), false)
        if err != nil {
            return 0, 0, err
        }

        if surge == 0 && unavailable == 0 {
            // Validation should never allow the user to explicitly use zero values for both maxSurge
            // maxUnavailable. Due to rounding down maxUnavailable though, it may resolve to zero.
            // If both fenceposts resolve to zero, then we should set maxUnavailable to 1 on the
            // theory that surge might not work due to quota.
            unavailable = 1
        }

        return int32(surge), int32(unavailable), nil
    }
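
The rounding behavior is easy to check directly with GetScaledValueFromIntOrPercent. The example below (illustrative numbers) uses 10 desired replicas and 25% for both values: maxSurge rounds up to 3, maxUnavailable rounds down to 2.

package main

import (
    "fmt"

    "k8s.io/apimachinery/pkg/util/intstr"
)

func main() {
    desired := 10
    maxSurge := intstr.FromString("25%")
    maxUnavailable := intstr.FromString("25%")

    // 10 * 25% = 2.5, rounded up for maxSurge
    surge, err := intstr.GetScaledValueFromIntOrPercent(&maxSurge, desired, true)
    if err != nil {
        panic(err)
    }
    // 10 * 25% = 2.5, rounded down for maxUnavailable
    unavailable, err := intstr.GetScaledValueFromIntOrPercent(&maxUnavailable, desired, false)
    if err != nil {
        panic(err)
    }
    fmt.Println(surge, unavailable) // 3 2
}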

Rolling Update

A Deployment's rolling update is essentially carried out by the old and new ReplicaSets, and is split into Scale Up and Scale Down.

  • Scale Up: the new ReplicaSet's replicas increase toward the count specified by deployment.Spec.Replicas
  • Scale Down: the old ReplicaSet's replicas decrease toward 0
  • A complete rolling update goes through several rounds of Scale Up and Scale Down

New and old ReplicaSets are not distinguished by creation time. Instead, the controller uses the EqualIgnoreHash func to compare rs.Spec.Template with deployment.Spec.Template.

If a matching template is found, that ReplicaSet is taken as the new RS; if none matches, a new RS is created.
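
A simplified sketch of that comparison (it mirrors what deploymentutil.EqualIgnoreHash does, but is written here for illustration): both templates are deep-copied, the controller-added pod-template-hash label is dropped, and the rest is compared semantically.

package main

import (
    "fmt"

    appsv1 "k8s.io/api/apps/v1"
    corev1 "k8s.io/api/core/v1"
    apiequality "k8s.io/apimachinery/pkg/api/equality"
)

// equalIgnoreHash compares two Pod templates while ignoring the
// pod-template-hash label that the controller adds to a ReplicaSet's template.
func equalIgnoreHash(t1, t2 *corev1.PodTemplateSpec) bool {
    t1Copy := t1.DeepCopy()
    t2Copy := t2.DeepCopy()
    delete(t1Copy.Labels, appsv1.DefaultDeploymentUniqueLabelKey) // "pod-template-hash"
    delete(t2Copy.Labels, appsv1.DefaultDeploymentUniqueLabelKey)
    return apiequality.Semantic.DeepEqual(t1Copy, t2Copy)
}

func main() {
    rsTemplate := corev1.PodTemplateSpec{}
    rsTemplate.Labels = map[string]string{"app": "demo", appsv1.DefaultDeploymentUniqueLabelKey: "abc123"}

    deployTemplate := corev1.PodTemplateSpec{}
    deployTemplate.Labels = map[string]string{"app": "demo"}

    // true: only the hash label differs, so this RS is treated as the new RS.
    fmt.Println(equalIgnoreHash(&rsTemplate, &deployTemplate))
}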

Scale Up

// NewRSNewReplicas calculates the number of replicas a deployment's new RS should have.
// When one of the followings is true, we're rolling out the deployment; otherwise, we're scaling it.
// 1) The new RS is saturated: newRS's replicas == deployment's replicas
// 2) Max number of pods allowed is reached: deployment's replicas + maxSurge == all RSs' replicas
func NewRSNewReplicas(deployment *apps.Deployment, allRSs []*apps.ReplicaSet, newRS *apps.ReplicaSet) (int32, error) {
    // For the rolling-update strategy, the new replica count is computed from MaxSurge
    switch deployment.Spec.Strategy.Type {
    case apps.RollingUpdateDeploymentStrategyType:
        // Check if we can scale up.
        maxSurge, err := intstrutil.GetScaledValueFromIntOrPercent(deployment.Spec.Strategy.RollingUpdate.MaxSurge, int(*(deployment.Spec.Replicas)), true)
        if err != nil {
            return 0, err
        }
        // Find the total number of pods
        // Count the Pods currently owned by this Deployment (across all ReplicaSets)
        currentPodCount := GetReplicaCountForReplicaSets(allRSs)
        maxTotalPods := *(deployment.Spec.Replicas) + int32(maxSurge)
        if currentPodCount >= maxTotalPods {
            // Cannot scale up.
            return *(newRS.Spec.Replicas), nil
        }
        // Scale up.
        // Number of replicas to add
        scaleUpCount := maxTotalPods - currentPodCount
        // Do not exceed the number of desired replicas.
        scaleUpCount = int32(integer.IntMin(int(scaleUpCount), int(*(deployment.Spec.Replicas)-*(newRS.Spec.Replicas))))
        return *(newRS.Spec.Replicas) + scaleUpCount, nil
    // For the Recreate strategy, simply return the desired replica count
    case apps.RecreateDeploymentStrategyType:
        return *(deployment.Spec.Replicas), nil
    default:
        return 0, fmt.Errorf("deployment type %v isn't supported", deployment.Spec.Strategy.Type)
    }
}
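
A worked example of the rolling-update branch above, with hypothetical numbers (10 desired replicas, maxSurge already resolved to 3, 11 Pods currently running across all ReplicaSets, the new RS at 1 replica):

package main

import "fmt"

func main() {
    desired := int32(10)
    maxSurge := int32(3)
    currentPodCount := int32(11) // Pods across all ReplicaSets
    newRSReplicas := int32(1)

    maxTotalPods := desired + maxSurge // 13
    if currentPodCount >= maxTotalPods {
        fmt.Println("cannot scale up, new RS stays at", newRSReplicas)
        return
    }
    scaleUpCount := maxTotalPods - currentPodCount // 13 - 11 = 2
    if capped := desired - newRSReplicas; scaleUpCount > capped {
        scaleUpCount = capped // never exceed the desired replica count
    }
    fmt.Println("new RS should be scaled to", newRSReplicas+scaleUpCount) // 3
}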

Scale Down

There are two scale-down scenarios:

func (dc *DeploymentController) reconcileOldReplicaSets(ctx context.Context, allRSs []*apps.ReplicaSet, oldRSs []*apps.ReplicaSet, newRS *apps.ReplicaSet, deployment *apps.Deployment) (bool, error) {
    oldPodsCount := deploymentutil.GetReplicaCountForReplicaSets(oldRSs)
    if oldPodsCount == 0 {
        // Can't scale down further
        return false, nil
    }

    allPodsCount := deploymentutil.GetReplicaCountForReplicaSets(allRSs)
    klog.V(4).Infof("New replica set %s/%s has %d available pods.", newRS.Namespace, newRS.Name, newRS.Status.AvailableReplicas)
    maxUnavailable := deploymentutil.MaxUnavailable(*deployment)

    // Check if we can scale down. We can scale down in the following 2 cases:
    // * Some old replica sets have unhealthy replicas, we could safely scale down those unhealthy replicas since that won't further
    // increase unavailability.
    // * New replica set has scaled up and it's replicas becomes ready, then we can scale down old replica sets in a further step.
    //
    // maxScaledDown := allPodsCount - minAvailable - newReplicaSetPodsUnavailable
    // take into account not only maxUnavailable and any surge pods that have been created, but also unavailable pods from
    // the newRS, so that the unavailable pods from the newRS would not make us scale down old replica sets in a further
    // step(that will increase unavailability).
    //
    // Concrete example:
    //
    // * 10 replicas
    // * 2 maxUnavailable (absolute number, not percent)
    // * 3 maxSurge (absolute number, not percent)
    //
    // case 1:
    // * Deployment is updated, newRS is created with 3 replicas, oldRS is scaled down to 8, and newRS is scaled up to 5.
    // * The new replica set pods crashloop and never become available.
    // * allPodsCount is 13. minAvailable is 8. newRSPodsUnavailable is 5.
    // * A node fails and causes one of the oldRS pods to become unavailable. However, 13 - 8 - 5 = 0, so the oldRS won't be scaled down.
    // * The user notices the crashloop and does kubectl rollout undo to rollback.
    // * newRSPodsUnavailable is 1, since we rolled back to the good replica set, so maxScaledDown = 13 - 8 - 1 = 4. 4 of the crashlooping pods will be scaled down.
    // * The total number of pods will then be 9 and the newRS can be scaled up to 10.
    //
    // case 2:
    // Same example, but pushing a new pod template instead of rolling back (aka "roll over"):
    // * The new replica set created must start with 0 replicas because allPodsCount is already at 13.
    // * However, newRSPodsUnavailable would also be 0, so the 2 old replica sets could be scaled down by 5 (13 - 8 - 0), which would then
    // allow the new replica set to be scaled up by 5.
    minAvailable := *(deployment.Spec.Replicas) - maxUnavailable
    newRSUnavailablePodCount := *(newRS.Spec.Replicas) - newRS.Status.AvailableReplicas
    maxScaledDown := allPodsCount - minAvailable - newRSUnavailablePodCount
    if maxScaledDown <= 0 {
        return false, nil
    }

    // Clean up unhealthy replicas first, otherwise unhealthy replicas will block deployment
    // and cause timeout. See https://github.com/kubernetes/kubernetes/issues/16737
    // Scenario 1: delete unhealthy Pods in the old RSs
    oldRSs, cleanupCount, err := dc.cleanupUnhealthyReplicas(ctx, oldRSs, deployment, maxScaledDown)
    if err != nil {
        return false, nil
    }
    klog.V(4).Infof("Cleaned up unhealthy replicas from old RSes by %d", cleanupCount)

    // Scale down old replica sets, need check maxUnavailable to ensure we can scale down
    // Scenario 2: scale down the old RSs based on maxUnavailable, Pod readiness, etc.
    allRSs = append(oldRSs, newRS)
    scaledDownCount, err := dc.scaleDownOldReplicaSetsForRollingUpdate(ctx, allRSs, oldRSs, deployment)
    if err != nil {
        return false, nil
    }
    klog.V(4).Infof("Scaled down old RSes of deployment %s by %d", deployment.Name, scaledDownCount)
    // Total scaled down = scenario 1 + scenario 2
    totalScaledDown := cleanupCount + scaledDownCount
    return totalScaledDown > 0, nil
}
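
The maxScaledDown arithmetic from "case 1" in the comments above, reproduced with plain numbers:

package main

import "fmt"

func main() {
    // 10 desired replicas, maxUnavailable=2, 13 Pods in total, 5 unavailable Pods in the new RS.
    desired := int32(10)
    maxUnavailable := int32(2)
    allPodsCount := int32(13)
    newRSUnavailable := int32(5)

    minAvailable := desired - maxUnavailable                        // 8
    maxScaledDown := allPodsCount - minAvailable - newRSUnavailable // 13 - 8 - 5 = 0
    fmt.Println(maxScaledDown) // 0: the old RSs cannot be scaled down yet
}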

Recreate Update Strategy

The Recreate strategy deletes all Pods in the old RSs first, and only then creates the Pods of the new RS.

  • scaleDownOldReplicaSetsForRecreate: scales the old RSs down by calling scaleReplicaSetAndRecordEvent with a new replica count of 0, i.e. all their Pods are removed

  • scaleUpNewReplicaSetForRecreate: scales the new RS up by calling scaleReplicaSetAndRecordEvent with the Deployment's desired replica count, so all the new Pods are created

  • Once the update is complete, historical RSs are cleaned up, keeping at most spec.revisionHistoryLimit revisions

  • Finally the Deployment status is updated; kubectl describe shows the rollout information

  • Ref:https://github.com/kubernetes/kubernetes/blob/88e994f6bf8fc88114c5b733e09afea339bea66d/pkg/controller/deployment/recreate.go#L29

    // rolloutRecreate implements the logic for recreating a replica set.
    func (dc *DeploymentController) rolloutRecreate(ctx context.Context, d *apps.Deployment, rsList []*apps.ReplicaSet, podMap map[types.UID][]*v1.Pod) error {
        // Don't create a new RS if not already existed, so that we avoid scaling up before scaling down.
        newRS, oldRSs, err := dc.getAllReplicaSetsAndSyncRevision(ctx, d, rsList, false)
        if err != nil {
            return err
        }
        allRSs := append(oldRSs, newRS)
        activeOldRSs := controller.FilterActiveReplicaSets(oldRSs)

        // scale down old replica sets.
        scaledDown, err := dc.scaleDownOldReplicaSetsForRecreate(ctx, activeOldRSs, d)
        if err != nil {
            return err
        }
        if scaledDown {
            // Update DeploymentStatus.
            return dc.syncRolloutStatus(ctx, allRSs, newRS, d)
        }

        // Do not process a deployment when it has old pods running.
        if oldPodsRunning(newRS, oldRSs, podMap) {
            return dc.syncRolloutStatus(ctx, allRSs, newRS, d)
        }

        // If we need to create a new RS, create it now.
        if newRS == nil {
            newRS, oldRSs, err = dc.getAllReplicaSetsAndSyncRevision(ctx, d, rsList, true)
            if err != nil {
                return err
            }
            allRSs = append(oldRSs, newRS)
        }

        // scale up new replica set.
        if _, err := dc.scaleUpNewReplicaSetForRecreate(ctx, newRS, d); err != nil {
            return err
        }

        if util.DeploymentComplete(d, &d.Status) {
            if err := dc.cleanupDeployment(ctx, oldRSs, d); err != nil {
                return err
            }
        }

        // Sync deployment status.
        return dc.syncRolloutStatus(ctx, allRSs, newRS, d)
    }

MinReadySeconds

The role of MinReadySeconds is that, during Scale Up, a newly created Pod is considered available only after it has stayed in the Ready state for MinReadySeconds.

  • This prevents a newly created Pod that crashes shortly after becoming Ready from affecting service availability during the update (a simplified availability check is sketched below)
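
A simplified sketch of that availability check (an approximation for illustration, not the exact controller code): a Pod only counts as available once its Ready condition has been true for at least minReadySeconds.

package main

import (
    "fmt"
    "time"

    corev1 "k8s.io/api/core/v1"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// isAvailable approximates the check: the Pod must be Ready, and its Ready
// condition must have been true for at least minReadySeconds.
func isAvailable(pod *corev1.Pod, minReadySeconds int32, now time.Time) bool {
    for _, c := range pod.Status.Conditions {
        if c.Type == corev1.PodReady && c.Status == corev1.ConditionTrue {
            if minReadySeconds == 0 {
                return true
            }
            waited := time.Duration(minReadySeconds) * time.Second
            return c.LastTransitionTime.Add(waited).Before(now)
        }
    }
    return false
}

func main() {
    pod := &corev1.Pod{}
    pod.Status.Conditions = []corev1.PodCondition{{
        Type:               corev1.PodReady,
        Status:             corev1.ConditionTrue,
        LastTransitionTime: metav1.NewTime(time.Now().Add(-5 * time.Second)),
    }}
    // Ready for only 5s, so with minReadySeconds=10 the Pod is not yet available.
    fmt.Println(isAvailable(pod, 10, time.Now())) // false
}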