Skip to content

Commit 651a2e9

Browse files
style95 and bdoyle0182 authored
Add zero downtime deployment (#5338)
* Deploy controllers without downtime
* Deploy invokers without downtime
* Deploy schedulers without downtime
* Fix typo
* Fix typo
* Add a disable API to controllers
* Remove unnecessary steps
* Add more logs for container liveness
* Change Set to thread-safe one
* Use the transaction ID of the activation
* Gracefully shutdown activation client proxy
* Update core/invoker/src/main/scala/org/apache/openwhisk/core/containerpool/v2/ActivationClientProxy.scala
  Apply suggestion
  Co-authored-by: Brendan Doyle <[email protected]>
* Update core/invoker/src/main/scala/org/apache/openwhisk/core/containerpool/v2/ActivationClientProxy.scala
  Apply suggestion
  Co-authored-by: Brendan Doyle <[email protected]>
* Update core/invoker/src/main/scala/org/apache/openwhisk/core/containerpool/v2/ActivationClientProxy.scala
  Co-authored-by: Brendan Doyle <[email protected]>
* Apply #5334
* Remove akka-http dependency from the invoker reactive
* Exclude the prewarm containers count from the /pool/count route
* Add missing import
* Make it compatible with scala-2.13
  In scala-2.13 mapValues returns a MapView, and it cannot be cast to Map by default.
* Fix test cases
* Add container id to the logs of ActivationClientProxy

Co-authored-by: Brendan Doyle <[email protected]>
1 parent 74ca61c commit 651a2e9

File tree

38 files changed

+607
-233
lines changed

38 files changed

+607
-233
lines changed

ansible/group_vars/all

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,8 @@ controller:
112112
authentication:
113113
spi: "{{ controller_authentication_spi | default('') }}"
114114
loglevel: "{{ controller_loglevel | default(whisk_loglevel) | default('INFO') }}"
115+
username: "{{ controller_username | default('controller.user') }}"
116+
password: "{{ controller_password | default('controller.pass') }}"
115117
entitlement:
116118
spi: "{{ controller_entitlement_spi | default('') }}"
117119
protocol: "{{ controller_protocol | default('https') }}"
@@ -126,6 +128,10 @@ controller:
126128
password: "openwhisk"
127129
name: "{{ __controller_ssl_keyPrefix }}openwhisk-keystore.p12"
128130
extraEnv: "{{ controller_extraEnv | default({}) }}"
131+
deployment:
132+
ignore_error: "{{ controller_deployment_ignore_error | default('False') }}"
133+
retries: "{{ controller_deployment_retries | default(180) }}"
134+
delay: "{{ controller_deployment_delay | default(5) }}"
129135

130136
jmx:
131137
basePortController: 15000
@@ -234,6 +240,10 @@ invoker:
234240
creationMaxPeek: "{{ container_creation_max_peek | default(500) }}"
235241
reactiveSpi: "{{ invokerReactive_spi | default('') }}"
236242
serverSpi: "{{ invokerServer_spi | default('') }}"
243+
deployment:
244+
ignore_error: "{{ invoker_deployment_ignore_error | default('False') }}"
245+
retries: "{{ invoker_deployment_retries | default(180) }}"
246+
delay: "{{ invoker_deployment_delay | default(5) }}"
237247

238248
userLogs:
239249
spi: "{{ userLogs_spi | default('org.apache.openwhisk.core.containerpool.logging.DockerToActivationLogStoreProvider') }}"
@@ -450,8 +460,7 @@ metrics:
450460
user_events: "{{ user_events_enabled | default(false) | lower }}"
451461

452462
zeroDowntimeDeployment:
453-
enabled: "{{ zerodowntime_deployment_switch | default(true) }}"
454-
solution: "{{ zerodowntime_deployment_solution | default('apicall') }}"
463+
enabled: "{{ zerodowntime_deployment_switch | default(false) }}"
455464

456465
etcd:
457466
version: "{{ etcd_version | default('v3.4.0') }}"

ansible/roles/controller/tasks/deploy.yml

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -155,6 +155,8 @@
155155
"CONFIG_whisk_info_date": "{{ whisk.version.date }}"
156156
"CONFIG_whisk_info_buildNo": "{{ docker.image.tag }}"
157157
"CONFIG_whisk_cluster_name": "{{ whisk.cluster_name | lower }}"
158+
"CONFIG_whisk_controller_username": "{{ controller.username }}"
159+
"CONFIG_whisk_controller_password": "{{ controller.password }}"
158160

159161
"KAFKA_HOSTS": "{{ kafka_connect_string }}"
160162
"CONFIG_whisk_kafka_replicationFactor":
@@ -363,6 +365,53 @@
363365
include_tasks: "lean.yml"
364366
when: lean
365367

368+
# Before redeploying the controller, remove that controller instance from nginx's upstream configuration
369+
- name: remove the controller from nginx's upstream configuration
370+
shell:
371+
docker exec -t nginx sh -c "sed -i \"s/ server {{ ansible_host }}:{{ controller.basePort + (controller_index | int) }}/ \#server {{ ansible_host }}:{{ controller.basePort + (controller_index | int) }}/g\" /etc/nginx/nginx.conf && nginx -s reload"
372+
delegate_to: "{{ item }}"
373+
with_items: "{{ groups['edge'] }}"
374+
when: zeroDowntimeDeployment.enabled == true
375+
376+
- name: wait some time for controllers fire all existing triggers
377+
shell: sleep 5s
378+
when: zeroDowntimeDeployment.enabled == true
379+
380+
- name: wait until {{ controller_name }} executes all existing activations
381+
uri:
382+
url: "{{ controller.protocol }}://{{ ansible_host }}:{{ controller.basePort + (controller_index | int) }}/activation/count"
383+
validate_certs: no
384+
client_key: "{{ controller.confdir }}/controller{{ groups['controllers'].index(inventory_hostname) }}/{{ controller.ssl.key }}"
385+
client_cert: "{{ controller.confdir }}/controller{{ groups['controllers'].index(inventory_hostname) }}/{{ controller.ssl.cert }}"
386+
return_content: yes
387+
user: "{{ controller.username }}"
388+
password: "{{ controller.password }}"
389+
force_basic_auth: yes
390+
register: result
391+
until: result.content == '0'
392+
retries: "{{ controller.deployment.retries }}"
393+
delay: "{{ controller.deployment.delay }}"
394+
when: zeroDowntimeDeployment.enabled == true
395+
ignore_errors: "{{ controller.deployment.ignore_error }}"
396+
397+
- name: Disable {{ controller_name }} before remove controller
398+
uri:
399+
url: "{{ controller.protocol }}://{{ ansible_host }}:{{ controller.basePort + groups['controllers'].index(inventory_hostname) }}/disable"
400+
validate_certs: no
401+
client_key: "{{ controller.confdir }}/{{ controller_name }}/{{ controller.ssl.key }}"
402+
client_cert: "{{ controller.confdir }}/{{ controller_name }}/{{ controller.ssl.cert }}"
403+
method: POST
404+
status_code: 200
405+
user: "{{ controller.username }}"
406+
password: "{{ controller.password }}"
407+
force_basic_auth: yes
408+
ignore_errors: "{{ controller.deployment.ignore_error }}"
409+
when: zeroDowntimeDeployment.enabled == true
410+
411+
- name: wait some time for controller to gracefully shutdown the consumer for activation ack
412+
shell: sleep 5s
413+
when: zeroDowntimeDeployment.enabled == true
414+
366415
- name: (re)start controller
367416
docker_container:
368417
name: "{{ controller_name }}"
@@ -397,3 +446,37 @@
397446
until: result.status == 200
398447
retries: 12
399448
delay: 10
449+
450+
- name: warm up activation path
451+
uri:
452+
url:
453+
"{{controller.protocol}}://{{ lookup('file', '{{ catalog_auth_key }}')}}@{{ansible_host}}:{{controller_port}}/api/v1/namespaces/_/actions/invokerHealthTestAction{{controller_index}}?blocking=false&result=false"
454+
validate_certs: "no"
455+
client_key:
456+
"{{ controller.confdir }}/{{ controller_name }}/{{ controller.ssl.key }}"
457+
client_cert:
458+
"{{ controller.confdir }}/{{ controller_name }}/{{ controller.ssl.cert }}"
459+
method: POST
460+
ignore_errors: True
461+
462+
- name: wait for all invokers in {{ controller_name }} to become up
463+
uri:
464+
url: "{{ controller.protocol }}://{{ ansible_host }}:{{ controller.basePort + (controller_index | int) }}/invokers"
465+
validate_certs: no
466+
client_key: "{{ controller.confdir }}/controller{{ groups['controllers'].index(inventory_hostname) }}/{{ controller.ssl.key }}"
467+
client_cert: "{{ controller.confdir }}/controller{{ groups['controllers'].index(inventory_hostname) }}/{{ controller.ssl.cert }}"
468+
return_content: yes
469+
register: invokerStatus
470+
until: invokerStatus.json|length >= 1 and "unhealthy" not in invokerStatus.content
471+
retries: 14
472+
delay: 5
473+
when: zeroDowntimeDeployment.enabled == true
474+
475+
# Once all invokers have reported their status to the controller, add the controller instance back to nginx if at least one invoker is up
476+
- name: Add the controller back to nginx's upstream configuration when there exist at least one healthy invoker
477+
shell:
478+
docker exec -t nginx sh -c "sed -i \"s/ \#server {{ ansible_host }}:{{ controller.basePort + (controller_index | int) }}/ server {{ ansible_host }}:{{ controller.basePort + (controller_index | int) }}/g\" /etc/nginx/nginx.conf && nginx -s reload"
479+
delegate_to: "{{ item }}"
480+
with_items: "{{ groups['edge'] }}"
481+
ignore_errors: True
482+
when: zeroDowntimeDeployment.enabled == true and "up" in invokerStatus.content

ansible/roles/invoker/tasks/clean.yml

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,37 @@
2222
invoker_name: "{{ name_prefix ~ ((invoker_index_base | int) + host_group.index(inventory_hostname)) }}"
2323
invoker_index: "{{ (invoker_index_base | int) + host_group.index(inventory_hostname) }}"
2424

25+
- name: disable invoker{{ groups['invokers'].index(inventory_hostname) }}
26+
uri:
27+
url: "{{ invoker.protocol }}://{{ ansible_host }}:{{ invoker.port + groups['invokers'].index(inventory_hostname) }}/disable"
28+
validate_certs: no
29+
client_key: "{{ invoker.confdir }}/invoker{{ groups['invokers'].index(inventory_hostname) }}/{{ invoker.ssl.key }}"
30+
client_cert: "{{ invoker.confdir }}/invoker{{ groups['invokers'].index(inventory_hostname) }}/{{ invoker.ssl.cert }}"
31+
method: POST
32+
status_code: 200
33+
user: "{{ invoker.username }}"
34+
password: "{{ invoker.password }}"
35+
force_basic_auth: yes
36+
ignore_errors: "{{ invoker.deployment.ignore_error }}"
37+
when: zeroDowntimeDeployment.enabled == true and enable_scheduler
38+
39+
- name: wait invoker{{ groups['invokers'].index(inventory_hostname) }} to clean up all existing containers
40+
uri:
41+
url: "{{ invoker.protocol }}://{{ ansible_host }}:{{ invoker.port + groups['invokers'].index(inventory_hostname) }}/pool/count"
42+
validate_certs: no
43+
client_key: "{{ invoker.confdir }}/invoker{{ groups['invokers'].index(inventory_hostname) }}/{{ invoker.ssl.key }}"
44+
client_cert: "{{ invoker.confdir }}/invoker{{ groups['invokers'].index(inventory_hostname) }}/{{ invoker.ssl.cert }}"
45+
user: "{{ invoker.username }}"
46+
password: "{{ invoker.password }}"
47+
force_basic_auth: yes
48+
return_content: yes
49+
register: result
50+
until: result.content == '0'
51+
retries: "{{ invoker.deployment.retries }}"
52+
delay: "{{ invoker.deployment.delay }}"
53+
when: zeroDowntimeDeployment.enabled == true and enable_scheduler
54+
ignore_errors: "{{ invoker.deployment.ignore_error }}"
55+
2556
- name: remove invoker
2657
docker_container:
2758
name: "{{ invoker_name }}"
@@ -59,12 +90,14 @@
5990
path: "{{ whisk_logs_dir }}/{{ invoker_name }}"
6091
state: absent
6192
become: "{{ logs.dir.become }}"
93+
when: mode == "clean"
6294

6395
- name: remove invoker conf directory
6496
file:
6597
path: "{{ invoker.confdir }}/{{ invoker_name }}"
6698
state: absent
6799
become: "{{ invoker.dir.become }}"
100+
when: mode == "clean"
68101

69102
# Workaround for orphaned ifstate.veth* files on Ubuntu 14.04
70103
# See https://github.com/moby/moby/issues/22513

ansible/roles/invoker/tasks/deploy.yml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,12 @@
1717
---
1818
# This role installs invokers.
1919

20+
###
21+
# When the zero-downtime-deployment is enabled, clean.yml is used to gracefully shut down the invoker.
22+
#
23+
- import_tasks: clean.yml
24+
when: zeroDowntimeDeployment.enabled == true and enable_scheduler
25+
2026
- import_tasks: docker_login.yml
2127

2228
- name: get invoker name and index

ansible/roles/schedulers/tasks/deploy.yml

Lines changed: 6 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -280,11 +280,6 @@
280280
include_tasks: "{{ item }}.yml"
281281
with_items: "{{ scheduler_plugins | default([]) }}"
282282

283-
- name: Judge current scheduler whether deployed
284-
shell: echo $(docker ps | grep {{ scheduler_name }} | wc -l)
285-
register: schedulerDeployed
286-
when: zeroDowntimeDeployment.enabled == true
287-
288283
- name: disable scheduler{{ groups['schedulers'].index(inventory_hostname) }} before redeploy scheduler
289284
uri:
290285
url: "{{ scheduler.protocol }}://{{ ansible_host }}:{{ scheduler_port }}/disable"
@@ -295,27 +290,23 @@
295290
password: "{{ scheduler.password }}"
296291
force_basic_auth: yes
297292
ignore_errors: "{{ scheduler.deployment_ignore_error }}"
298-
when: zeroDowntimeDeployment.enabled == true and schedulerDeployed.stdout != "0"
293+
when: zeroDowntimeDeployment.enabled == true
299294

300-
- name: wait until all queue and create queue task is finished before redeploy scheduler when using apicall solution or half solution
295+
- name: wait until all activation is finished before redeploy scheduler
301296
uri:
302-
url: "{{ scheduler.protocol }}://{{ ansible_host }}:{{ scheduler_port }}/queue/total"
297+
url: "{{ scheduler.protocol }}://{{ ansible_host }}:{{ scheduler_port }}/activation/count"
303298
validate_certs: no
304299
return_content: yes
305300
user: "{{ scheduler.username }}"
306301
password: "{{ scheduler.password }}"
307302
force_basic_auth: yes
308-
register: totalQueue
309-
until: totalQueue.content == "0"
303+
register: result
304+
until: result.content == "0"
310305
retries: 180
311306
delay: 5
312-
when: zeroDowntimeDeployment.enabled == true and schedulerDeployed.stdout != "0"
307+
when: zeroDowntimeDeployment.enabled == true
313308
ignore_errors: "{{ scheduler.deployment_ignore_error }}"
314309

315-
- name: wait until all queue and create queue task is finished before redeploy scheduler using sleep solution
316-
shell: sleep 120s
317-
when: zeroDowntimeDeployment.enabled == true and schedulerDeployed.stdout != "0" and zeroDowntimeDeployment.solution == 'sleep'
318-
319310
- name: (re)start scheduler
320311
docker_container:
321312
name: "{{ scheduler_name }}"

ansible/templates/whisk.properties.j2

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,11 +53,15 @@ edge.host.apiport=443
5353
kafkaras.host.port={{ kafka.ras.port }}
5454
redis.host.port={{ redis.port }}
5555
invoker.hosts.basePort={{ invoker.port }}
56+
invoker.username={{ invoker.username }}
57+
invoker.password={{ invoker.password }}
5658

5759
controller.hosts={{ groups["controllers"] | map('extract', hostvars, 'ansible_host') | list | join(",") }}
5860
controller.host.basePort={{ controller.basePort }}
5961
controller.instances={{ controller.instances }}
6062
controller.protocol={{ controller.protocol }}
63+
controller.username={{ controller.username }}
64+
controller.password={{ controller.password }}
6165

6266
invoker.container.network=bridge
6367
invoker.container.policy={{ invoker_container_policy_name | default()}}

common/scala/src/main/scala/org/apache/openwhisk/core/WhiskConfig.scala

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -312,6 +312,9 @@ object ConfigKeys {
312312

313313
val dataManagementServiceRetryInterval = "whisk.scheduler.data-management-service.retry-interval"
314314

315+
val whiskControllerUsername = "whisk.controller.username"
316+
val whiskControllerPassword = "whisk.controller.password"
317+
315318
val whiskSchedulerUsername = "whisk.scheduler.username"
316319
val whiskSchedulerPassword = "whisk.scheduler.password"
317320

common/scala/src/main/scala/org/apache/openwhisk/core/connector/Message.scala

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -532,9 +532,13 @@ object InvokerResourceMessage extends DefaultJsonProtocol {
532532
* ...
533533
* ]
534534
*/
535-
object StatusQuery
535+
object GetState
536536

537-
case class StatusData(invocationNamespace: String, fqn: String, waitingActivation: Int, status: String, data: String)
537+
case class StatusData(invocationNamespace: String,
538+
fqn: String,
539+
waitingActivation: List[ActivationId],
540+
status: String,
541+
data: String)
538542
extends Message {
539543

540544
override def serialize: String = StatusData.serdes.write(this).compactPrint

common/scala/src/main/scala/org/apache/openwhisk/core/service/WatcherService.scala

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,10 @@ package org.apache.openwhisk.core.service
2020
import akka.actor.{Actor, ActorRef, ActorSystem, Props}
2121
import com.ibm.etcd.api.Event.EventType
2222
import com.ibm.etcd.client.kv.WatchUpdate
23-
import org.apache.openwhisk.common.Logging
23+
import org.apache.openwhisk.common.{GracefulShutdown, Logging}
2424
import org.apache.openwhisk.core.etcd.EtcdClient
2525
import org.apache.openwhisk.core.etcd.EtcdType._
26+
2627
import scala.collection.JavaConverters._
2728
import scala.collection.concurrent.TrieMap
2829

@@ -141,6 +142,13 @@ class WatcherService(etcdClient: EtcdClient)(implicit logging: Logging, actorSys
141142
// always send WatcherClosed back to the sender if it needs feedback
142143
if (request.needFeedback)
143144
sender ! WatcherClosed(request.watchKey, request.isPrefix)
145+
146+
case GracefulShutdown =>
147+
watcher.close()
148+
putWatchers.clear()
149+
deleteWatchers.clear()
150+
prefixPutWatchers.clear()
151+
prefixDeleteWatchers.clear()
144152
}
145153
}
146154

core/controller/src/main/resources/application.conf

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -122,4 +122,8 @@ whisk{
122122
file-system : true
123123
dir-path : "/swagger-ui/"
124124
}
125+
controller {
126+
username: "controller.user"
127+
password: "controller.pass"
128+
}
125129
}

0 commit comments

Comments
 (0)