
Commit e102a16

ci-operator/templates/openshift/installer/cluster-launch-installer-e2e: Gather node console logs on AWS
To help debug things like [1]:

    Dec 2 16:31:41.298: INFO: cluster upgrade is Failing: Cluster operator kube-apiserver is reporting a failure: NodeControllerDegraded: The master node(s) "ip-10-0-136-232.ec2.internal" not ready ... Kubelet stopped posting node status.

where a node goes down but does not come back up far enough to reconnect as a node.

Eventually, we'll address this with machine-health checks, killing the non-responsive machine and automatically replacing it with a new one. That's currently waiting on an etcd operator that can handle reconnecting control-plane machines automatically. But in the short term, and possibly still in the long term, it's nice to collect what we can from the broken machine to understand why it didn't come back up.

This code isn't specific to broken machines, but collecting console logs from all nodes should cover us in the broken-machine case as well.

[1]: https://bugzilla.redhat.com/show_bug.cgi?id=1778904
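The diff below derives bare EC2 instance IDs from each node's Kubernetes `.spec.providerID`, which on AWS has the form `aws:///<availability-zone>/<instance-id>`. A minimal sketch of that trimming step, using invented sample IDs:

```shell
# Sample providerID strings (invented for illustration).  The sed expression
# from the diff drops everything through the final '/', leaving the bare
# instance IDs that "aws ec2 get-console-output" expects.
printf '%s\n' \
  'aws:///us-east-1a/i-0123456789abcdef0' \
  'aws:///us-east-1b/i-0fedcba9876543210' |
  sed 's|.*/||'
# -> i-0123456789abcdef0
#    i-0fedcba9876543210
```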
1 parent c2932ea commit e102a16

File tree

1 file changed (+20 −0 lines)


ci-operator/templates/openshift/installer/cluster-launch-installer-e2e.yaml (20 additions, 0 deletions)

@@ -785,6 +785,10 @@ objects:
         value: /etc/openshift-installer/gce.json
       - name: KUBECONFIG
         value: /tmp/artifacts/installer/auth/kubeconfig
+      - name: USER
+        value: test
+      - name: HOME
+        value: /tmp
       command:
       - /bin/bash
       - -c
@@ -852,6 +856,7 @@ objects:
         fi

         oc --insecure-skip-tls-verify --request-timeout=5s get nodes -o jsonpath --template '{range .items[*]}{.metadata.name}{"\n"}{end}' > /tmp/nodes
+        oc --insecure-skip-tls-verify --request-timeout=5s get nodes -o jsonpath --template '{range .items[*]}{.spec.providerID}{"\n"}{end}' | sed 's|.*/||' > /tmp/node-provider-IDs
         oc --insecure-skip-tls-verify --request-timeout=5s get pods --all-namespaces --template '{{ range .items }}{{ $name := .metadata.name }}{{ $ns := .metadata.namespace }}{{ range .spec.containers }}-n {{ $ns }} {{ $name }} -c {{ .name }}{{ "\n" }}{{ end }}{{ range .spec.initContainers }}-n {{ $ns }} {{ $name }} -c {{ .name }}{{ "\n" }}{{ end }}{{ end }}' > /tmp/containers
         oc --insecure-skip-tls-verify --request-timeout=5s get pods -l openshift.io/component=api --all-namespaces --template '{{ range .items }}-n {{ .metadata.namespace }} {{ .metadata.name }}{{ "\n" }}{{ end }}' > /tmp/pods-api
@@ -892,6 +897,21 @@ objects:
           queue /tmp/artifacts/nodes/$i/heap oc --insecure-skip-tls-verify get --request-timeout=20s --raw /api/v1/nodes/$i/proxy/debug/pprof/heap
         done < /tmp/nodes

+        if [[ "${CLUSTER_TYPE}" = "aws" ]]; then
+            # FIXME: get epel-release or otherwise add awscli to our teardown image
+            export PATH="${HOME}/.local/bin:${PATH}"
+            easy_install --user pip  # our Python 2.7.5 is even too old for ensurepip
+            pip install --user awscli
+            export AWS_REGION="$(python -c 'import json; data = json.load(open("/tmp/artifacts/installer/metadata.json")); print(data["aws"]["region"])')"
+        fi
+
+        while IFS= read -r i; do
+            mkdir -p "/tmp/artifacts/nodes/${i}"
+            if [[ "${CLUSTER_TYPE}" = "aws" ]]; then
+                queue /tmp/artifacts/nodes/$i/console aws ec2 get-console-output --instance-id "${i}"
+            fi
+        done < /tmp/node-provider-IDs
+
         FILTER=gzip queue /tmp/artifacts/nodes/masters-journal.gz oc --insecure-skip-tls-verify adm node-logs --role=master --unify=false
         FILTER=gzip queue /tmp/artifacts/nodes/workers-journal.gz oc --insecure-skip-tls-verify adm node-logs --role=worker --unify=false
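The gather step added above can be exercised locally without a cluster. The sketch below mirrors its shape under stated assumptions: the metadata file and instance IDs are invented, the real `queue … aws ec2 get-console-output` call is left as a comment because it needs AWS credentials, and `python3` is tried as a fallback since the CI image's `python` (2.7) may not exist elsewhere:

```shell
set -eu
mkdir -p /tmp/sketch
# Invented stand-in for /tmp/artifacts/installer/metadata.json; only the
# aws.region key from the diff matters here.
cat > /tmp/sketch/metadata.json <<'EOF'
{"aws": {"region": "us-east-1"}}
EOF
# The template uses plain "python" (2.7 in the CI image); fall back to
# python3 where that is all that's installed.
py="$(command -v python || command -v python3)"
AWS_REGION="$("$py" -c 'import json; print(json.load(open("/tmp/sketch/metadata.json"))["aws"]["region"])')"
export AWS_REGION

# Invented instance IDs standing in for /tmp/node-provider-IDs.
printf '%s\n' 'i-0123456789abcdef0' 'i-0fedcba9876543210' > /tmp/sketch/node-provider-IDs
while IFS= read -r i; do
  mkdir -p "/tmp/sketch/nodes/${i}"
  # Real call elided; it needs credentials and the queue helper:
  # queue /tmp/sketch/nodes/$i/console aws ec2 get-console-output --instance-id "${i}"
done < /tmp/sketch/node-provider-IDs

echo "${AWS_REGION}"
ls /tmp/sketch/nodes
```

One artifact directory per instance ID keeps a failed `get-console-output` for one node from clobbering the output gathered for another.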
