-
Notifications
You must be signed in to change notification settings - Fork 1.2k
Add zero downtime deployment #5338
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 15 commits
c7d04ed
8310d54
a49c0a3
edab778
e76938b
cfe96fa
ec626d8
a7c359a
6e6d07d
9b0704f
393880a
ea848fc
d9bb23c
0c90289
baf0f54
9540ad5
c182975
fd208a3
c39dcd3
bad5904
e7a100b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -155,6 +155,8 @@ | |
"CONFIG_whisk_info_date": "{{ whisk.version.date }}" | ||
"CONFIG_whisk_info_buildNo": "{{ docker.image.tag }}" | ||
"CONFIG_whisk_cluster_name": "{{ whisk.cluster_name | lower }}" | ||
"CONFIG_whisk_controller_username": "{{ controller.username }}" | ||
"CONFIG_whisk_controller_password": "{{ controller.password }}" | ||
|
||
"KAFKA_HOSTS": "{{ kafka_connect_string }}" | ||
"CONFIG_whisk_kafka_replicationFactor": | ||
|
@@ -363,6 +365,53 @@ | |
include_tasks: "lean.yml" | ||
when: lean | ||
|
||
# Before redeploying the controller, remove that controller instance from nginx | ||
- name: remove the controller from nginx's upstream configuration | ||
shell: | ||
docker exec -t nginx sh -c "sed -i \"s/ server {{ ansible_host }}:{{ controller.basePort + (controller_index | int) }}/ \#server {{ ansible_host }}:{{ controller.basePort + (controller_index | int) }}/g\" /etc/nginx/nginx.conf && nginx -s reload" | ||
delegate_to: "{{ item }}" | ||
with_items: "{{ groups['edge'] }}" | ||
when: zeroDowntimeDeployment.enabled == true | ||
|
||
- name: wait some time for controllers fire all existing triggers | ||
shell: sleep 5s | ||
when: zeroDowntimeDeployment.enabled == true | ||
|
||
- name: wait until {{ controller_name }} executes all existing activations | ||
uri: | ||
url: "{{ controller.protocol }}://{{ ansible_host }}:{{ controller.basePort + (controller_index | int) }}/activation/count" | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The controller will not be redeployed until there are no in-flight activations left. |
||
validate_certs: no | ||
client_key: "{{ controller.confdir }}/controller{{ groups['controllers'].index(inventory_hostname) }}/{{ controller.ssl.key }}" | ||
client_cert: "{{ controller.confdir }}/controller{{ groups['controllers'].index(inventory_hostname) }}/{{ controller.ssl.cert }}" | ||
return_content: yes | ||
user: "{{ controller.username }}" | ||
password: "{{ controller.password }}" | ||
force_basic_auth: yes | ||
register: result | ||
until: result.content == '0' | ||
retries: "{{ controller.deployment.retries }}" | ||
delay: "{{ controller.deployment.delay }}" | ||
when: zeroDowntimeDeployment.enabled == true | ||
ignore_errors: "{{ controller.deployment.ignore_error }}" | ||
|
||
- name: Disable {{ controller_name }} before remove controller | ||
uri: | ||
url: "{{ controller.protocol }}://{{ ansible_host }}:{{ controller.basePort + groups['controllers'].index(inventory_hostname) }}/disable" | ||
validate_certs: no | ||
client_key: "{{ controller.confdir }}/{{ controller_name }}/{{ controller.ssl.key }}" | ||
client_cert: "{{ controller.confdir }}/{{ controller_name }}/{{ controller.ssl.cert }}" | ||
method: POST | ||
status_code: 200 | ||
user: "{{ controller.username }}" | ||
password: "{{ controller.password }}" | ||
force_basic_auth: yes | ||
ignore_errors: "{{ controller.deployment.ignore_error }}" | ||
when: zeroDowntimeDeployment.enabled == true | ||
|
||
- name: wait some time for controller to gracefully shutdown the consumer for activation ack | ||
shell: sleep 5s | ||
when: zeroDowntimeDeployment.enabled == true | ||
|
||
- name: (re)start controller | ||
docker_container: | ||
name: "{{ controller_name }}" | ||
|
@@ -397,3 +446,37 @@ | |
until: result.status == 200 | ||
retries: 12 | ||
delay: 10 | ||
|
||
- name: warm up activation path | ||
uri: | ||
url: | ||
"{{controller.protocol}}://{{ lookup('file', '{{ catalog_auth_key }}')}}@{{ansible_host}}:{{controller_port}}/api/v1/namespaces/_/actions/invokerHealthTestAction{{controller_index}}?blocking=false&result=false" | ||
validate_certs: "no" | ||
client_key: | ||
"{{ controller.confdir }}/{{ controller_name }}/{{ controller.ssl.key }}" | ||
client_cert: | ||
"{{ controller.confdir }}/{{ controller_name }}/{{ controller.ssl.cert }}" | ||
method: POST | ||
ignore_errors: True | ||
|
||
- name: wait for all invokers in {{ controller_name }} to become up | ||
uri: | ||
url: "{{ controller.protocol }}://{{ ansible_host }}:{{ controller.basePort + (controller_index | int) }}/invokers" | ||
validate_certs: no | ||
client_key: "{{ controller.confdir }}/controller{{ groups['controllers'].index(inventory_hostname) }}/{{ controller.ssl.key }}" | ||
client_cert: "{{ controller.confdir }}/controller{{ groups['controllers'].index(inventory_hostname) }}/{{ controller.ssl.cert }}" | ||
return_content: yes | ||
register: invokerStatus | ||
until: invokerStatus.json|length >= 1 and "unhealthy" not in invokerStatus.content | ||
retries: 14 | ||
delay: 5 | ||
when: zeroDowntimeDeployment.enabled == true | ||
|
||
# Once all invokers have reported their status to the controller, add the controller instance back to nginx if at least one invoker is up | ||
- name: Add the controller back to nginx's upstream configuration when there exist at least one healthy invoker | ||
shell: | ||
docker exec -t nginx sh -c "sed -i \"s/ \#server {{ ansible_host }}:{{ controller.basePort + (controller_index | int) }}/ server {{ ansible_host }}:{{ controller.basePort + (controller_index | int) }}/g\" /etc/nginx/nginx.conf && nginx -s reload" | ||
delegate_to: "{{ item }}" | ||
with_items: "{{ groups['edge'] }}" | ||
ignore_errors: True | ||
when: zeroDowntimeDeployment.enabled == true and "up" in invokerStatus.content |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -280,11 +280,6 @@ | |
include_tasks: "{{ item }}.yml" | ||
with_items: "{{ scheduler_plugins | default([]) }}" | ||
|
||
- name: Judge current scheduler whether deployed | ||
shell: echo $(docker ps | grep {{ scheduler_name }} | wc -l) | ||
register: schedulerDeployed | ||
when: zeroDowntimeDeployment.enabled == true | ||
|
||
- name: disable scheduler{{ groups['schedulers'].index(inventory_hostname) }} before redeploy scheduler | ||
uri: | ||
url: "{{ scheduler.protocol }}://{{ ansible_host }}:{{ scheduler_port }}/disable" | ||
|
@@ -295,27 +290,23 @@ | |
password: "{{ scheduler.password }}" | ||
force_basic_auth: yes | ||
ignore_errors: "{{ scheduler.deployment_ignore_error }}" | ||
when: zeroDowntimeDeployment.enabled == true and schedulerDeployed.stdout != "0" | ||
when: zeroDowntimeDeployment.enabled == true | ||
|
||
- name: wait until all queue and create queue task is finished before redeploy scheduler when using apicall solution or half solution | ||
- name: wait until all activation is finished before redeploy scheduler | ||
uri: | ||
url: "{{ scheduler.protocol }}://{{ ansible_host }}:{{ scheduler_port }}/queue/total" | ||
url: "{{ scheduler.protocol }}://{{ ansible_host }}:{{ scheduler_port }}/activation/count" | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It's the same as for the other components. |
||
validate_certs: no | ||
return_content: yes | ||
user: "{{ scheduler.username }}" | ||
password: "{{ scheduler.password }}" | ||
force_basic_auth: yes | ||
register: totalQueue | ||
until: totalQueue.content == "0" | ||
register: result | ||
until: result.content == "0" | ||
retries: 180 | ||
delay: 5 | ||
when: zeroDowntimeDeployment.enabled == true and schedulerDeployed.stdout != "0" | ||
when: zeroDowntimeDeployment.enabled == true | ||
ignore_errors: "{{ scheduler.deployment_ignore_error }}" | ||
|
||
- name: wait until all queue and create queue task is finished before redeploy scheduler using sleep solution | ||
shell: sleep 120s | ||
when: zeroDowntimeDeployment.enabled == true and schedulerDeployed.stdout != "0" and zeroDowntimeDeployment.solution == 'sleep' | ||
|
||
- name: (re)start scheduler | ||
docker_container: | ||
name: "{{ scheduler_name }}" | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Personally, I don't like this approach, but it is a limitation we have to work around, because open-source nginx does not support active health checks by default.
It is related to #5337