Updated Kubernetes to v1.4.3

kayrus · kayrus · commit 84cf1ce2bd5d · 2016-10-21T10:42:25.000+02:00
* Updated kube-dns to v20 and switched it from a ReplicationController (RC) to a Deployment
* Fixed issues with the CoreOS cluster checker
* Added Kubernetes cluster readiness checker
diff --git a/deploy_coreos_cluster.sh b/deploy_coreos_cluster.sh
@@ -162,6 +162,7 @@ trap - EXIT
 trap
 
 OS_NAME="coreos"
+SSH_USER="core"
 
 virsh list --all --name | grep -q "^${OS_NAME}1$" && { print_red "'${OS_NAME}1' VM already exists"; exit 1; }
 
@@ -365,10 +366,11 @@ if [ "x${SKIP_SSH_CHECK}" = "x" ]; then
       TRY=$((TRY+1))
       if [ $TRY -gt $MAX_SSH_TRIES ]; then
         print_red "Can not connect to ssh, exiting..."
+        exit 1
       fi
       echo "Trying to connect to ${VM_HOSTNAME} VM, #${TRY} of #${MAX_SSH_TRIES}..."
       set +e
-      RES=$(LANG=en_US ssh -l $SSH_USER -o ConnectTimeout=1 -o PasswordAuthentication=no -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -i ${PRIV_KEY_PATH} $VM_HOSTNAME "uptime" 2>&1)
+      RES=$(LANG=en_US ssh -l $SSH_USER -o BatchMode=yes -o ConnectTimeout=1 -o PasswordAuthentication=no -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -i ${PRIV_KEY_PATH} $VM_HOSTNAME "uptime" 2>&1)
       RES_CODE=$?
       set -e
       if [ $RES_CODE -eq 0 ]; then
diff --git a/deploy_k8s_cluster.sh b/deploy_k8s_cluster.sh
@@ -175,6 +175,7 @@ OS_NAME="coreos"
 PREFIX="k8s"
 MASTER_PREFIX="${PREFIX}-master"
 NODE_PREFIX="${PREFIX}-node"
+SSH_USER="core"
 
 virsh list --all --name | grep -q "^${PREFIX}-[mn]" && { print_red "'$PREFIX-*' VMs already exist"; exit 1; }
 
@@ -300,7 +301,7 @@ if [ -n "$OPTVAL_CPU" ]; then
   CPUs=$OPTVAL_CPU
 fi
 
-K8S_RELEASE="v1.3.5"
+K8S_RELEASE="v1.4.3"
 K8S_IMAGE="gcr.io/google_containers/hyperkube:${K8S_RELEASE}"
 FLANNEL_TYPE=vxlan
 
@@ -445,6 +446,7 @@ done
 
 if [ "x${SKIP_SSH_CHECK}" = "x" ]; then
   MAX_SSH_TRIES=50
+  MAX_KUBECTL_TRIES=200
   for SEQ in $(seq 1 $CLUSTER_SIZE); do
     if [ "$SEQ" = "1" ]; then
       VM_HOSTNAME=$MASTER_PREFIX
@@ -457,10 +459,11 @@ if [ "x${SKIP_SSH_CHECK}" = "x" ]; then
       TRY=$((TRY+1))
       if [ $TRY -gt $MAX_SSH_TRIES ]; then
         print_red "Can not connect to ssh, exiting..."
+        exit 1
       fi
       echo "Trying to connect to ${VM_HOSTNAME} VM, #${TRY} of #${MAX_SSH_TRIES}..."
       set +e
-      RES=$(LANG=en_US ssh -l $SSH_USER -o ConnectTimeout=1 -o PasswordAuthentication=no -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -i ${PRIV_KEY_PATH} $VM_HOSTNAME "uptime" 2>&1)
+      RES=$(LANG=en_US ssh -l $SSH_USER -o BatchMode=yes -o ConnectTimeout=1 -o PasswordAuthentication=no -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -i ${PRIV_KEY_PATH} $VM_HOSTNAME "uptime" 2>&1)
       RES_CODE=$?
       set -e
       if [ $RES_CODE -eq 0 ]; then
@@ -470,7 +473,34 @@ if [ "x${SKIP_SSH_CHECK}" = "x" ]; then
       fi
     done
   done
-  print_green "Cluster of $CLUSTER_SIZE $OS_NAME nodes is up and running."
+  print_green "Cluster of $CLUSTER_SIZE $OS_NAME nodes is up and running, waiting for Kubernetes to be ready..."
+  for SEQ in $(seq 1 $CLUSTER_SIZE); do  # poll every cluster member's node status through kubectl on the master
+    if [ "$SEQ" = "1" ]; then
+      VM_HOSTNAME=$MASTER_PREFIX
+    else
+      NODE_SEQ=$((SEQ-1))  # nodes are numbered from 1, right after the master (SEQ=1)
+      VM_HOSTNAME="${NODE_PREFIX}-$NODE_SEQ"
+    fi
+    TRY=0
+    while true; do
+      TRY=$((TRY+1))
+      if [ $TRY -gt $MAX_KUBECTL_TRIES ]; then
+        print_red "Can not verify Kubernetes status, exiting..."
+        exit 1
+      fi
+      echo "Trying to check whether ${VM_HOSTNAME} Kubernetes node is up and running, #${TRY} of #${MAX_KUBECTL_TRIES}..."
+      set +e  # tolerate a failing probe; its exit code is inspected below
+      RES=$(LANG=en_US ssh -l $SSH_USER -o BatchMode=yes -o ConnectTimeout=1 -o PasswordAuthentication=no -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -i "${PRIV_KEY_PATH}" $MASTER_PREFIX "/opt/bin/kubectl get nodes $VM_HOSTNAME | grep -qw Ready" 2>&1)
+      RES_CODE=$?  # 0 only when a whole-word "Ready" is present; -w keeps "NotReady" from matching
+      set -e
+      if [ $RES_CODE -eq 0 ]; then
+        break
+      else
+        sleep 1
+      fi
+    done
+  done
+  print_green "Kubernetes cluster is up and running..."
 fi
 
 print_green "Use following command to connect to your cluster: 'ssh -i \"$PRIV_KEY_PATH\" core@$COREOS_MASTER_HOSTNAME'"
diff --git a/deploy_vms_cluster.sh b/deploy_vms_cluster.sh
@@ -227,7 +227,7 @@ runcmd:
   ubuntu)
     BOOT_HOOK="runcmd:
   - service networking restart"
-    handle_channel_release yakkety current
+    handle_channel_release xenial current
     # extra size for images
     IMG_SIZE="10G"
     IMG_NAME="${CHANNEL}-server-cloudimg-amd64.qcow2"
@@ -520,10 +520,11 @@ if [ "x${SKIP_SSH_CHECK}" = "x" ]; then
       TRY=$((TRY+1))
       if [ $TRY -gt $MAX_SSH_TRIES ]; then
         print_red "Can not connect to ssh, exiting..."
+        exit 1
       fi
       echo "Trying to connect to ${VM_HOSTNAME} VM, #${TRY} of #${MAX_SSH_TRIES}..."
       set +e
-      RES=$(LANG=en_US ssh -l $SSH_USER -o ConnectTimeout=1 -o PasswordAuthentication=no -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -i ${PRIV_KEY_PATH} $VM_HOSTNAME "uptime" 2>&1)
+      RES=$(LANG=en_US ssh -l $SSH_USER -o BatchMode=yes -o ConnectTimeout=1 -o PasswordAuthentication=no -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -i ${PRIV_KEY_PATH} $VM_HOSTNAME "uptime" 2>&1)
       RES_CODE=$?
       set -e
       if [ $RES_CODE -eq 0 ]; then
diff --git a/k8s_master.yaml b/k8s_master.yaml
@@ -96,124 +96,124 @@ write-files:
         - name: dns-tcp
           port: 53
           protocol: TCP
-
       ---
-
-      apiVersion: v1
-      kind: ReplicationController
+      apiVersion: extensions/v1beta1
+      kind: Deployment
       metadata:
-        name: kube-dns-v11
+        name: kube-dns-v20
         namespace: kube-system
         labels:
           k8s-app: kube-dns
-          version: v11
+          version: v20
           kubernetes.io/cluster-service: "true"
       spec:
+        strategy:
+          type: RollingUpdate
+          rollingUpdate:
+            # Ensure we have at least 1 alive pod during update (don't kill old pod until new pod is up and running)
+            maxSurge: 1
+            maxUnavailable: 0
         replicas: 1
         selector:
-          k8s-app: kube-dns
-          version: v11
+          matchLabels:
+            k8s-app: kube-dns
+            version: v20
         template:
           metadata:
             labels:
               k8s-app: kube-dns
-              version: v11
-              kubernetes.io/cluster-service: "true"
+              version: v20
+            annotations:
+              scheduler.alpha.kubernetes.io/critical-pod: ''
+              scheduler.alpha.kubernetes.io/tolerations: '[{"key":"CriticalAddonsOnly", "operator":"Exists"}]'
           spec:
             containers:
-            - name: etcd
-              image: gcr.io/google_containers/etcd:2.2.1
-              resources:
-                # keep request = limit to keep this container in guaranteed class
-                limits:
-                  cpu: 100m
-                  memory: 50Mi
-                requests:
-                  cpu: 100m
-                  memory: 50Mi
-              command:
-              - /usr/local/bin/etcd
-              - -data-dir
-              - /var/etcd/data
-              - -listen-client-urls
-              - http://127.0.0.1:2379,http://127.0.0.1:4001
-              - -advertise-client-urls
-              - http://127.0.0.1:2379,http://127.0.0.1:4001
-              - -initial-cluster-token
-              - skydns-etcd
-              volumeMounts:
-              - name: etcd-storage
-                mountPath: /var/etcd/data
-            - name: kube2sky
-              image: gcr.io/google_containers/kube2sky:1.14
+            - name: kubedns
+              image: gcr.io/google_containers/kubedns-amd64:1.8
               resources:
-                # keep request = limit to keep this container in guaranteed class
+                # TODO: Set memory limits when we've profiled the container for large
+                # clusters, then set request = limit to keep this container in
+                # guaranteed class. Currently, this container falls into the
+                # "burstable" category so the kubelet doesn't backoff from restarting it.
                 limits:
-                  cpu: 100m
-                  memory: 50Mi
+                  memory: 170Mi
                 requests:
                   cpu: 100m
-                  memory: 50Mi
-              args:
-              # command = "/kube2sky"
-              - --domain=%K8S_DOMAIN%
-            - name: skydns
-              image: gcr.io/google_containers/skydns:2015-10-13-8c72f8c
-              resources:
-                # keep request = limit to keep this container in guaranteed class
-                limits:
-                  cpu: 100m
-                  memory: 50Mi
-                requests:
-                  cpu: 100m
-                  memory: 50Mi
-              command: ["sh", "-c", "while true; do echo -e \"PUT /v2/keys/skydns/config HTTP/1.1\r\nAccept: */*\r\nContent-Length: 26\r\nContent-Type: application/x-www-form-urlencoded\r\n\r\nvalue=%7B%22ndot%22%3A1%7D\" | nc localhost 2379 2>&1 | grep 'HTTP/1.1 200 OK' && /skydns --machines=http://127.0.0.1:2379 --addr=0.0.0.0:53 --ns-rotate=false --domain=%K8S_DOMAIN%. ; sleep 1; done"]
-#              args:
-#              # command = "/skydns"
-#              - --machines=http://127.0.0.1:2379
-#              - --addr=0.0.0.0:53
-#              - --ns-rotate=false
-#              - --domain=%K8S_DOMAIN%.
-              ports:
-              - containerPort: 53
-                name: dns
-                protocol: UDP
-              - containerPort: 53
-                name: dns-tcp
-                protocol: TCP
+                  memory: 70Mi
               livenessProbe:
                 httpGet:
-                  path: /healthz
+                  path: /healthz-kubedns
                   port: 8080
                   scheme: HTTP
-                initialDelaySeconds: 30
+                initialDelaySeconds: 60
                 timeoutSeconds: 5
+                successThreshold: 1
+                failureThreshold: 5
               readinessProbe:
                 httpGet:
-                  path: /healthz
+                  path: /readiness
+                  port: 8081
+                  scheme: HTTP
+                # we poll on pod startup for the Kubernetes master service and
+                # only setup the /readiness HTTP server once that's available.
+                initialDelaySeconds: 3
+                timeoutSeconds: 5
+              args:
+              # command = "/kube-dns"
+              - --domain=%K8S_DOMAIN%.
+              - --dns-port=10053
+              ports:
+              - containerPort: 10053
+                name: dns-local
+                protocol: UDP
+              - containerPort: 10053
+                name: dns-tcp-local
+                protocol: TCP
+            - name: dnsmasq
+              image: gcr.io/google_containers/kube-dnsmasq-amd64:1.4
+              livenessProbe:
+                httpGet:
+                  path: /healthz-dnsmasq
                   port: 8080
                   scheme: HTTP
-                initialDelaySeconds: 1
+                initialDelaySeconds: 60
                 timeoutSeconds: 5
+                successThreshold: 1
+                failureThreshold: 5
+              args:
+              - --cache-size=1000
+              - --no-resolv
+              - --server=127.0.0.1#10053
+              - --log-facility=-
+              ports:
+              - containerPort: 53
+                name: dns
+                protocol: UDP
+              - containerPort: 53
+                name: dns-tcp
+                protocol: TCP
             - name: healthz
-              image: gcr.io/google_containers/exechealthz:1.0
+              image: gcr.io/google_containers/exechealthz-amd64:1.2
               resources:
-                # keep request = limit to keep this container in guaranteed class
                 limits:
-                  cpu: 10m
-                  memory: 20Mi
+                  memory: 50Mi
                 requests:
                   cpu: 10m
-                  memory: 20Mi
+                  # Note that this container shouldn't really need 50Mi of memory. The
+                  # limits are set higher than expected pending investigation on #29688.
+                  # The extra memory was stolen from the kubedns container to keep the
+                  # net memory requested by the pod constant.
+                  memory: 50Mi
               args:
-              - -cmd=nslookup kubernetes.default.svc.%K8S_DOMAIN% 127.0.0.1 >/dev/null
-              - -port=8080
+              - --cmd=nslookup kubernetes.default.svc.%K8S_DOMAIN% 127.0.0.1 >/dev/null
+              - --url=/healthz-dnsmasq
+              - --cmd=nslookup kubernetes.default.svc.%K8S_DOMAIN% 127.0.0.1:10053 >/dev/null
+              - --url=/healthz-kubedns
+              - --port=8080
+              - --quiet
               ports:
               - containerPort: 8080
                 protocol: TCP
-            volumes:
-            - name: etcd-storage
-              emptyDir: {}
             dnsPolicy: Default  # Don't use cluster DNS.
   - path: /etc/kubernetes/manifests/kube-apiserver.yaml
     permissions: '0644'