Skip to content

Add zero downtime deployment #5338

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 21 commits into from
Nov 1, 2022
Merged
Show file tree
Hide file tree
Changes from 15 commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 11 additions & 2 deletions ansible/group_vars/all
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,8 @@ controller:
authentication:
spi: "{{ controller_authentication_spi | default('') }}"
loglevel: "{{ controller_loglevel | default(whisk_loglevel) | default('INFO') }}"
username: "{{ controller_username | default('controller.user') }}"
password: "{{ controller_password | default('controller.pass') }}"
entitlement:
spi: "{{ controller_entitlement_spi | default('') }}"
protocol: "{{ controller_protocol | default('https') }}"
Expand All @@ -126,6 +128,10 @@ controller:
password: "openwhisk"
name: "{{ __controller_ssl_keyPrefix }}openwhisk-keystore.p12"
extraEnv: "{{ controller_extraEnv | default({}) }}"
deployment:
ignore_error: "{{ controller_deployment_ignore_error | default('False') }}"
retries: "{{ controller_deployment_retries | default(180) }}"
delay: "{{ controller_deployment_delay | default(5) }}"

jmx:
basePortController: 15000
Expand Down Expand Up @@ -234,6 +240,10 @@ invoker:
creationMaxPeek: "{{ container_creation_max_peek | default(500) }}"
reactiveSpi: "{{ invokerReactive_spi | default('') }}"
serverSpi: "{{ invokerServer_spi | default('') }}"
deployment:
ignore_error: "{{ invoker_deployment_ignore_error | default('False') }}"
retries: "{{ invoker_deployment_retries | default(180) }}"
delay: "{{ invoker_deployment_delay | default(5) }}"

userLogs:
spi: "{{ userLogs_spi | default('org.apache.openwhisk.core.containerpool.logging.DockerToActivationLogStoreProvider') }}"
Expand Down Expand Up @@ -450,8 +460,7 @@ metrics:
user_events: "{{ user_events_enabled | default(false) | lower }}"

zeroDowntimeDeployment:
enabled: "{{ zerodowntime_deployment_switch | default(true) }}"
solution: "{{ zerodowntime_deployment_solution | default('apicall') }}"
enabled: "{{ zerodowntime_deployment_switch | default(false) }}"

etcd:
version: "{{ etcd_version | default('v3.4.0') }}"
Expand Down
83 changes: 83 additions & 0 deletions ansible/roles/controller/tasks/deploy.yml
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,8 @@
"CONFIG_whisk_info_date": "{{ whisk.version.date }}"
"CONFIG_whisk_info_buildNo": "{{ docker.image.tag }}"
"CONFIG_whisk_cluster_name": "{{ whisk.cluster_name | lower }}"
"CONFIG_whisk_controller_username": "{{ controller.username }}"
"CONFIG_whisk_controller_password": "{{ controller.password }}"

"KAFKA_HOSTS": "{{ kafka_connect_string }}"
"CONFIG_whisk_kafka_replicationFactor":
Expand Down Expand Up @@ -363,6 +365,53 @@
include_tasks: "lean.yml"
when: lean

# Before redeploying the controller, remove that controller instance from nginx's upstream configuration
- name: remove the controller from nginx's upstream configuration
shell:
docker exec -t nginx sh -c "sed -i \"s/ server {{ ansible_host }}:{{ controller.basePort + (controller_index | int) }}/ \#server {{ ansible_host }}:{{ controller.basePort + (controller_index | int) }}/g\" /etc/nginx/nginx.conf && nginx -s reload"
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Personally, I don't like this approach, but this is a kind of limitation because opensource nginx does not support active health check by default.
It is related to #5337

delegate_to: "{{ item }}"
with_items: "{{ groups['edge'] }}"
when: zeroDowntimeDeployment.enabled == true

- name: wait some time for controllers to fire all existing triggers
shell: sleep 5s
when: zeroDowntimeDeployment.enabled == true

- name: wait until {{ controller_name }} executes all existing activations
uri:
url: "{{ controller.protocol }}://{{ ansible_host }}:{{ controller.basePort + (controller_index | int) }}/activation/count"
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The controller would not be deployed until there is no inflight activation.

validate_certs: no
client_key: "{{ controller.confdir }}/controller{{ groups['controllers'].index(inventory_hostname) }}/{{ controller.ssl.key }}"
client_cert: "{{ controller.confdir }}/controller{{ groups['controllers'].index(inventory_hostname) }}/{{ controller.ssl.cert }}"
return_content: yes
user: "{{ controller.username }}"
password: "{{ controller.password }}"
force_basic_auth: yes
register: result
until: result.content == '0'
retries: "{{ controller.deployment.retries }}"
delay: "{{ controller.deployment.delay }}"
when: zeroDowntimeDeployment.enabled == true
ignore_errors: "{{ controller.deployment.ignore_error }}"

# Tell the running controller to stop accepting new work via its /disable
# endpoint before the container is replaced; authenticates with the
# controller basic-auth credentials introduced in group_vars.
- name: Disable {{ controller_name }} before remove controller
  uri:
    # NOTE(review): the port is derived from groups['controllers'].index(...)
    # here, while sibling tasks use (controller_index | int) — presumably
    # equivalent; confirm against how controller_index is set in this role.
    url: "{{ controller.protocol }}://{{ ansible_host }}:{{ controller.basePort + groups['controllers'].index(inventory_hostname) }}/disable"
    validate_certs: no
    client_key: "{{ controller.confdir }}/{{ controller_name }}/{{ controller.ssl.key }}"
    client_cert: "{{ controller.confdir }}/{{ controller_name }}/{{ controller.ssl.cert }}"
    method: POST
    status_code: 200
    user: "{{ controller.username }}"
    password: "{{ controller.password }}"
    force_basic_auth: yes
  # Failures here are tolerated when controller.deployment.ignore_error is set.
  ignore_errors: "{{ controller.deployment.ignore_error }}"
  when: zeroDowntimeDeployment.enabled == true

# Fixed grace period so the disabled controller can shut down its consumer
# for activation acks before the container is restarted below.
- name: wait some time for controller to gracefully shutdown the consumer for activation ack
  shell: sleep 5s
  when: zeroDowntimeDeployment.enabled == true

- name: (re)start controller
docker_container:
name: "{{ controller_name }}"
Expand Down Expand Up @@ -397,3 +446,37 @@
until: result.status == 200
retries: 12
delay: 10

# Fire one non-blocking invocation through the freshly restarted controller
# so the activation path is exercised before traffic is restored.
- name: warm up activation path
  uri:
    # Pass the variable to lookup() directly — nesting "{{ }}" inside a Jinja
    # expression is an Ansible anti-pattern and relies on double templating.
    url:
      "{{ controller.protocol }}://{{ lookup('file', catalog_auth_key) }}@{{ ansible_host }}:{{ controller_port }}/api/v1/namespaces/_/actions/invokerHealthTestAction{{ controller_index }}?blocking=false&result=false"
    validate_certs: "no"
    client_key:
      "{{ controller.confdir }}/{{ controller_name }}/{{ controller.ssl.key }}"
    client_cert:
      "{{ controller.confdir }}/{{ controller_name }}/{{ controller.ssl.cert }}"
    method: POST
  # Warm-up is best effort; a failure must not abort the deployment.
  ignore_errors: true

# Poll the controller's /invokers endpoint until it reports at least one
# invoker and none of them is "unhealthy", so the controller is only put
# back behind nginx (next task) once it can actually schedule activations.
- name: wait for all invokers in {{ controller_name }} to become up
  uri:
    url: "{{ controller.protocol }}://{{ ansible_host }}:{{ controller.basePort + (controller_index | int) }}/invokers"
    validate_certs: no
    client_key: "{{ controller.confdir }}/controller{{ groups['controllers'].index(inventory_hostname) }}/{{ controller.ssl.key }}"
    client_cert: "{{ controller.confdir }}/controller{{ groups['controllers'].index(inventory_hostname) }}/{{ controller.ssl.cert }}"
    return_content: yes
  register: invokerStatus
  # Retry up to 14 * 5s = 70s for the invoker list to become non-empty and
  # free of "unhealthy" entries.
  until: invokerStatus.json|length >= 1 and "unhealthy" not in invokerStatus.content
  retries: 14
  delay: 5
  when: zeroDowntimeDeployment.enabled == true

# When all invokers have reported their status to the controller, add the controller instance back to nginx if at least one invoker is up
# Re-enable the restarted controller in nginx by un-commenting its upstream
# entry (the reverse of the sed used to take it out of rotation) and
# hot-reloading nginx on every edge node.
- name: Add the controller back to nginx's upstream configuration when at least one healthy invoker exists
  shell:
    docker exec -t nginx sh -c "sed -i \"s/ \#server {{ ansible_host }}:{{ controller.basePort + (controller_index | int) }}/ server {{ ansible_host }}:{{ controller.basePort + (controller_index | int) }}/g\" /etc/nginx/nginx.conf && nginx -s reload"
  delegate_to: "{{ item }}"
  with_items: "{{ groups['edge'] }}"
  # Best effort: the entry may already be active in nginx.conf.
  ignore_errors: true
  when: zeroDowntimeDeployment.enabled == true and "up" in invokerStatus.content
33 changes: 33 additions & 0 deletions ansible/roles/invoker/tasks/clean.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,37 @@
invoker_name: "{{ name_prefix ~ ((invoker_index_base | int) + host_group.index(inventory_hostname)) }}"
invoker_index: "{{ (invoker_index_base | int) + host_group.index(inventory_hostname) }}"

# Ask the running invoker to stop accepting new activations via its /disable
# endpoint before it is removed; only applies when the new scheduler
# architecture is enabled.
- name: disable invoker{{ groups['invokers'].index(inventory_hostname) }}
  uri:
    url: "{{ invoker.protocol }}://{{ ansible_host }}:{{ invoker.port + groups['invokers'].index(inventory_hostname) }}/disable"
    validate_certs: no
    client_key: "{{ invoker.confdir }}/invoker{{ groups['invokers'].index(inventory_hostname) }}/{{ invoker.ssl.key }}"
    client_cert: "{{ invoker.confdir }}/invoker{{ groups['invokers'].index(inventory_hostname) }}/{{ invoker.ssl.cert }}"
    method: POST
    status_code: 200
    user: "{{ invoker.username }}"
    password: "{{ invoker.password }}"
    force_basic_auth: yes
  # Drain failures are tolerated when invoker.deployment.ignore_error is set.
  ignore_errors: "{{ invoker.deployment.ignore_error }}"
  when: zeroDowntimeDeployment.enabled == true and enable_scheduler

# Poll the invoker's /pool/count endpoint until it reports zero containers,
# i.e. all in-flight activations have completed and the pool is drained.
- name: wait for invoker{{ groups['invokers'].index(inventory_hostname) }} to clean up all existing containers
  uri:
    url: "{{ invoker.protocol }}://{{ ansible_host }}:{{ invoker.port + groups['invokers'].index(inventory_hostname) }}/pool/count"
    validate_certs: no
    client_key: "{{ invoker.confdir }}/invoker{{ groups['invokers'].index(inventory_hostname) }}/{{ invoker.ssl.key }}"
    client_cert: "{{ invoker.confdir }}/invoker{{ groups['invokers'].index(inventory_hostname) }}/{{ invoker.ssl.cert }}"
    user: "{{ invoker.username }}"
    password: "{{ invoker.password }}"
    force_basic_auth: yes
    return_content: yes
  register: result
  until: result.content == '0'
  # Poll cadence is configurable via invoker.deployment (retries * delay).
  retries: "{{ invoker.deployment.retries }}"
  delay: "{{ invoker.deployment.delay }}"
  ignore_errors: "{{ invoker.deployment.ignore_error }}"
  when: zeroDowntimeDeployment.enabled == true and enable_scheduler

- name: remove invoker
docker_container:
name: "{{ invoker_name }}"
Expand Down Expand Up @@ -59,12 +90,14 @@
path: "{{ whisk_logs_dir }}/{{ invoker_name }}"
state: absent
become: "{{ logs.dir.become }}"
when: mode == "clean"

# Remove the invoker's generated config directory — but only on a full clean.
# NOTE(review): presumably the zero-downtime path imports clean.yml with a
# mode other than "clean" so the directory survives a redeploy — confirm.
- name: remove invoker conf directory
  file:
    path: "{{ invoker.confdir }}/{{ invoker_name }}"
    state: absent
  become: "{{ invoker.dir.become }}"
  when: mode == "clean"

# Workaround for orphaned ifstate.veth* files on Ubuntu 14.04
# See https://github.com/moby/moby/issues/22513
Expand Down
6 changes: 6 additions & 0 deletions ansible/roles/invoker/tasks/deploy.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,12 @@
---
# This role installs invokers.

###
# When zero-downtime deployment is enabled (and the new scheduler is in use),
# run clean.yml first so the existing invoker is disabled and fully drained
# before the replacement container is started.
###
- import_tasks: clean.yml
  when: zeroDowntimeDeployment.enabled == true and enable_scheduler

- import_tasks: docker_login.yml

- name: get invoker name and index
Expand Down
21 changes: 6 additions & 15 deletions ansible/roles/schedulers/tasks/deploy.yml
Original file line number Diff line number Diff line change
Expand Up @@ -280,11 +280,6 @@
include_tasks: "{{ item }}.yml"
with_items: "{{ scheduler_plugins | default([]) }}"

- name: Judge current scheduler whether deployed
shell: echo $(docker ps | grep {{ scheduler_name }} | wc -l)
register: schedulerDeployed
when: zeroDowntimeDeployment.enabled == true

- name: disable scheduler{{ groups['schedulers'].index(inventory_hostname) }} before redeploy scheduler
uri:
url: "{{ scheduler.protocol }}://{{ ansible_host }}:{{ scheduler_port }}/disable"
Expand All @@ -295,27 +290,23 @@
password: "{{ scheduler.password }}"
force_basic_auth: yes
ignore_errors: "{{ scheduler.deployment_ignore_error }}"
when: zeroDowntimeDeployment.enabled == true and schedulerDeployed.stdout != "0"
when: zeroDowntimeDeployment.enabled == true

- name: wait until all queue and create queue task is finished before redeploy scheduler when using apicall solution or half solution
- name: wait until all activations are finished before redeploying the scheduler
uri:
url: "{{ scheduler.protocol }}://{{ ansible_host }}:{{ scheduler_port }}/queue/total"
url: "{{ scheduler.protocol }}://{{ ansible_host }}:{{ scheduler_port }}/activation/count"
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's the same with other components.
It will wait until all activations are handled.

validate_certs: no
return_content: yes
user: "{{ scheduler.username }}"
password: "{{ scheduler.password }}"
force_basic_auth: yes
register: totalQueue
until: totalQueue.content == "0"
register: result
until: result.content == "0"
retries: 180
delay: 5
when: zeroDowntimeDeployment.enabled == true and schedulerDeployed.stdout != "0"
when: zeroDowntimeDeployment.enabled == true
ignore_errors: "{{ scheduler.deployment_ignore_error }}"

- name: wait until all queue and create queue task is finished before redeploy scheduler using sleep solution
shell: sleep 120s
when: zeroDowntimeDeployment.enabled == true and schedulerDeployed.stdout != "0" and zeroDowntimeDeployment.solution == 'sleep'

- name: (re)start scheduler
docker_container:
name: "{{ scheduler_name }}"
Expand Down
4 changes: 4 additions & 0 deletions ansible/templates/whisk.properties.j2
Original file line number Diff line number Diff line change
Expand Up @@ -53,11 +53,15 @@ edge.host.apiport=443
kafkaras.host.port={{ kafka.ras.port }}
redis.host.port={{ redis.port }}
invoker.hosts.basePort={{ invoker.port }}
invoker.username={{ invoker.username }}
invoker.password={{ invoker.password }}

controller.hosts={{ groups["controllers"] | map('extract', hostvars, 'ansible_host') | list | join(",") }}
controller.host.basePort={{ controller.basePort }}
controller.instances={{ controller.instances }}
controller.protocol={{ controller.protocol }}
controller.username={{ controller.username }}
controller.password={{ controller.password }}

invoker.container.network=bridge
invoker.container.policy={{ invoker_container_policy_name | default()}}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -312,6 +312,9 @@ object ConfigKeys {

val dataManagementServiceRetryInterval = "whisk.scheduler.data-management-service.retry-interval"

val whiskControllerUsername = "whisk.controller.username"
val whiskControllerPassword = "whisk.controller.password"

val whiskSchedulerUsername = "whisk.scheduler.username"
val whiskSchedulerPassword = "whisk.scheduler.password"

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -532,9 +532,13 @@ object InvokerResourceMessage extends DefaultJsonProtocol {
* ...
* ]
*/
object StatusQuery
object GetState

case class StatusData(invocationNamespace: String, fqn: String, waitingActivation: Int, status: String, data: String)
case class StatusData(invocationNamespace: String,
fqn: String,
waitingActivation: List[ActivationId],
status: String,
data: String)
extends Message {

override def serialize: String = StatusData.serdes.write(this).compactPrint
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,10 @@ package org.apache.openwhisk.core.service
import akka.actor.{Actor, ActorRef, ActorSystem, Props}
import com.ibm.etcd.api.Event.EventType
import com.ibm.etcd.client.kv.WatchUpdate
import org.apache.openwhisk.common.Logging
import org.apache.openwhisk.common.{GracefulShutdown, Logging}
import org.apache.openwhisk.core.etcd.EtcdClient
import org.apache.openwhisk.core.etcd.EtcdType._

import scala.collection.JavaConverters._
import scala.collection.concurrent.TrieMap

Expand Down Expand Up @@ -141,6 +142,13 @@ class WatcherService(etcdClient: EtcdClient)(implicit logging: Logging, actorSys
// always send WatcherClosed back to sender if it need a feedback
if (request.needFeedback)
sender ! WatcherClosed(request.watchKey, request.isPrefix)

case GracefulShutdown =>
watcher.close()
putWatchers.clear()
deleteWatchers.clear()
prefixPutWatchers.clear()
prefixDeleteWatchers.clear()
}
}

Expand Down
4 changes: 4 additions & 0 deletions core/controller/src/main/resources/application.conf
Original file line number Diff line number Diff line change
Expand Up @@ -122,4 +122,8 @@ whisk{
file-system : true
dir-path : "/swagger-ui/"
}
controller {
username: "controller.user"
password: "controller.pass"
}
}
Loading