Source Code – https://github.com/devopsschool-demo-labs-projects/kubernetes-lab-demo/tree/main/Clusters-Troubleshooting
Troubleshooting Kubernetes Nodes
ssh aen@c1-cp1
cd ~/content/course/04/demo/
#Run the code in 1-TroubleshootingNodesBreakStuff.sh...there's a readme inside this file.
#This script will implement a breaking change on each worker node in the cluster.
#You'll need to update the login username for this to work.
sh 1-TroubleshootingNodesBreakStuff.sh 'aen'
# Worker Node Troubleshooting Scenario 1
#It can take a minute for the nodes' statuses to change to NotReady...wait until they are.
#Except for the Control Plane Node, all of the Nodes' statuses are NotReady. Let's check out why...
kubectl get nodes
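#Before logging onto a node, you can also inspect its Conditions from the control plane; a NotReady node
#typically shows the kubelet has stopped posting status. This is an optional extra check, not part of the original script:
kubectl describe node c1-node1
kubectl get node c1-node1 -o jsonpath='{range .status.conditions[*]}{.type}{"\t"}{.status}{"\t"}{.reason}{"\n"}{end}'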
#Remember the Control Plane Node still has a kubelet and runs pods...
#so this troubleshooting methodology applies there too.
#Let's start troubleshooting c1-node1's issues.
#ssh into c1-node1
ssh aen@c1-node1
#The kubelet runs as a systemd service/unit...so we can use those tools to troubleshoot why it's not working.
#Let's start by checking the status. Add --no-pager so the full output prints to the terminal instead of a pager.
#It's loaded, but it's inactive (dead)...so that means it's not running.
#We want the service to be active (running).
#So the first thing to check: is the service enabled?
sudo systemctl status kubelet.service --no-pager
#If the service isn't configured to start up by default (disabled), we can use enable to set it to.
sudo systemctl enable kubelet.service
#That just enables the service to start on boot; we could reboot now, or we can start it manually.
#So let's start it up and see what happens...ah, it's now active (running), which means the kubelet is online.
#We also see in the journald snippet that it's watching the apiserver. So good stuff there...
sudo systemctl start kubelet.service
sudo systemctl status kubelet.service
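#As a quick sanity check, systemctl can answer with a single word whether the unit is enabled and running
#(optional extra step, not in the original walkthrough):
sudo systemctl is-enabled kubelet.service
sudo systemctl is-active kubelet.service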
#Log out of the node and back onto c1-cp1.
exit
#Back on c1-cp1, is c1-node1 reporting Ready?
kubectl get nodes
# Worker Node Troubleshooting Scenario 2
ssh aen@c1-node2
#Crashlooping kubelet...indicated by code = exited and status = 255.
#But that didn't tell us WHY the kubelet is crashlooping, just that it is...let's dig deeper.
sudo systemctl status kubelet.service --no-pager
#systemd-based systems write logs to journald, so let's ask it for the kubelet's logs.
#This tells us exactly what's wrong: it failed to load the kubelet config file,
#which it thinks is at /var/lib/kubelet/config.yaml.
sudo journalctl -u kubelet.service --no-pager
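#If the journal is long, standard journalctl flags can narrow it down to the most recent entries
#(optional; -n limits the line count and --since limits the time window):
sudo journalctl -u kubelet.service --no-pager -n 50
sudo journalctl -u kubelet.service --no-pager --since "10 minutes ago"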
#Let's see what's in /var/lib/kubelet/...ah, look, the kubelet wants config.yaml, but we have config.yml.
sudo ls -la /var/lib/kubelet
#Now fix up that config by renaming the file and restarting the kubelet.
#Another option here would have been to edit the systemd unit configuration for the kubelet in /etc/systemd/system/kubelet.service.d/10-kubeadm.conf.
#We're going to look at that in the next demo below.
sudo mv /var/lib/kubelet/config.yml /var/lib/kubelet/config.yaml
sudo systemctl restart kubelet.service
#It should be active (running).
sudo systemctl status kubelet.service
#...let's log out and check the node status.
exit
#On c1-cp1, c1-node2 should be Ready.
kubectl get nodes
# Worker Node Troubleshooting Scenario 3
ssh aen@c1-node3
#Crashlooping again...let's dig deeper and grab the logs.
sudo systemctl status kubelet.service --no-pager
#Using journalctl we can pull the logs...this time it's looking for config.yml...
sudo journalctl -u kubelet.service --no-pager
#Is config.yml in /var/lib/kubelet? No, it's config.yaml...but I don't want to rename this because
#I want the filename to match the configs on all my other nodes.
sudo ls -la /var/lib/kubelet
#Let's reconfigure where the kubelet looks for this config file.
#Where is the kubelet config file specified? Check the systemd unit config for the kubelet.
#Where does systemd think the kubelet's config.yaml is?
sudo systemctl status kubelet.service --no-pager
sudo more /etc/systemd/system/kubelet.service.d/10-kubeadm.conf
#Let's update the config args; inside here is the startup configuration for the kubelet.
sudo vi /etc/systemd/system/kubelet.service.d/10-kubeadm.conf
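#As a sketch of what you're looking for: on a kubeadm-built node this drop-in usually passes the config file
#to the kubelet via an environment variable similar to the line below. The exact contents vary by version,
#so treat this as an assumption about the lab cluster, not the literal file:
#    Environment="KUBELET_CONFIG_ARGS=--config=/var/lib/kubelet/config.yaml"
#In this scenario that argument points at config.yml, so change it back to config.yaml and save the file.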
#Let's restart the kubelet...
sudo systemctl restart kubelet
#But since we edited the unit file, we need to reload the unit files (configs)...then restart the service.
sudo systemctl daemon-reload
sudo systemctl restart kubelet
#Check the status...active and running?
sudo systemctl status kubelet.service
#Log out and back onto c1-cp1.
exit
#Check our Nodes' statuses.
kubectl get nodes
Troubleshooting Kubernetes Control Plane
ssh aen@c1-cp1
cd ~/content/course/04/demo/
#1 - Control Plane Pods Stopped
#Remember the Control Plane Node still has a kubelet and runs pods...if the kubelet's not running, troubleshoot that first.
#This section focuses on the control plane when it's running the control plane as pods.
#Run this script on your Control Plane Node to break the control plane.
sh ./2-TroubleshootingControlPlaneBreakStuff-1.sh
#Let's check the status of our control plane pods...refused?
#It can take a bit to break the control plane...wait until the connection to the server is refused.
kubectl get pods --namespace kube-system
#Let's ask our container runtime what's up...well, there are pods running on this node, but no control plane pods.
#That's your clue...no control plane pods running...what starts up the control plane pods? Static pod manifests.
sudo crictl --runtime-endpoint unix:///run/containerd/containerd.sock ps
#Let's check config.yaml for the location of the static pod manifests.
#Look for staticPodPath.
#Do the yaml files exist at that location?
sudo more /var/lib/kubelet/config.yaml
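#A quick way to pull out just that one setting (kubeadm's default is /etc/kubernetes/manifests, but confirm it on your cluster):
sudo grep staticPodPath /var/lib/kubelet/config.yaml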
#The directory doesn't exist...oh no!
sudo ls -laR /etc/kubernetes/manifests
#Let's look up one directory...
sudo ls -la /etc/kubernetes/
#We could update config.yaml to point to this path, or rename the directory to put the manifests in the configured location.
#The kubelet will find these manifests and launch the pods again.
sudo mv /etc/kubernetes/manifests.wrong /etc/kubernetes/manifests
sudo ls /etc/kubernetes/manifests/
#Check the container runtime to ensure the pods are started...we can see they were created and running just a few seconds ago.
sudo crictl --runtime-endpoint unix:///run/containerd/containerd.sock ps
#Let's ask Kubernetes what it thinks...
kubectl get pods -n kube-system
#2 - Troubleshooting control plane failure, user Pods are all pending.
#Break the control plane.
sh 2-TroubleshootingControlPlaneBreakStuff-2.sh
#Let's start a workload.
kubectl create deployment nginx --image=nginx
kubectl scale deployment nginx --replicas=4
#Interesting, all of the pods are Pending...why?
kubectl get pods
#Nodes look good? Yes, they're all reporting Ready.
kubectl get nodes
#Let's look at the pods' events...<none>, nothing: no scheduling, no image pulling, no container starting...let's zoom out.
kubectl describe pods
#What's the next step after the pods are created by the ReplicaSet? Scheduling...
kubectl get events --sort-by='.metadata.creationTimestamp'
#So we know there are no scheduling events; let's check the control plane status...the scheduler isn't running.
kubectl get pods --namespace=kube-system
#Let's check the events on that pod...we can see it failed to pull the image for the scheduler; it says the image was not found.
#Looks like the manifest is trying to pull an image that doesn't exist.
kubectl describe pods --namespace kube-system kube-scheduler-c1-cp1
#That's defined in the static pod manifest.
sudo vi /etc/kubernetes/manifests/kube-scheduler.yaml
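#Inside the manifest, look at the image: line under spec.containers. On a kubeadm cluster it normally looks
#something like the line below; the registry and version tag depend on your cluster, so this is a sketch
#rather than your exact value. Fix the typo in the image name/tag, save the file, and the kubelet will
#recreate the static pod automatically.
#    image: registry.k8s.io/kube-scheduler:v1.28.0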
#Is the scheduler back online? Yes, it's running.
kubectl get pods --namespace=kube-system
#And our deployment is now up and running...it might take a minute or two for the pods to start up.
kubectl get deployment
#Clean up our resources...
kubectl delete deployments.apps nginx
Troubleshooting Kubernetes Workloads
ssh aen@c1-cp1
cd ~/content/course/04/demo/
#1 - Troubleshooting Pods
#Create the deployment we want to troubleshoot.
kubectl apply -f deployment-1.yaml
#Scenario:
#You get a call from a user who says none of their pods are up and running...can you help?
#Start troubleshooting...
#If no pods are up and running we need to find out why...
#No pods are Ready...0/3.
kubectl get deployment
#If no pods are running, let's look at the pods more closely.
kubectl get pods
#It's a bad image (Status is ErrImagePull or ImagePullBackOff)...
#This means our image isn't available or we have a config issue in our deployment.
#Check out the events for more information:
#Failed to pull image "gcr.io/google-samples/hello-ap:1.0": rpc error: code = Unknown desc = Error response from daemon: manifest for gcr.io/google-samples/hello-ap:1.0 not found
#It's hello-ap rather than hello-app.
kubectl describe pods
#We can also look at the events to get this information.
kubectl get events --sort-by='.metadata.creationTimestamp'
#DECLARATIVE SOLUTION:
#Apply the corrected manifest, which points to the correct image.
kubectl apply -f deployment-1-corrected.yaml
#IMPERATIVE SOLUTION:
#Change: - image: gcr.io/google-samples/hello-ap:1.0
#To:     - image: gcr.io/google-samples/hello-app:1.0
kubectl edit deployment hello-world-1
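#Another imperative option is kubectl set image, which also triggers a rollout. The container name below is
#an assumption; check the real name in the deployment's pod spec first:
kubectl get deployment hello-world-1 -o jsonpath='{.spec.template.spec.containers[0].name}{"\n"}'
kubectl set image deployment/hello-world-1 hello-world-1=gcr.io/google-samples/hello-app:1.0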
#3 of 3 should be up and Ready!
kubectl get deployment
#Clean up this demo before moving on.
kubectl delete -f deployment-1-corrected.yaml
#2 - Troubleshooting Deployments
#Create the deployment we want to troubleshoot.
kubectl apply -f deployment-2.yaml
#Scenario:
#You get a call...the pods are up but none are reporting 'Ready' and the app isn't accessible.
#Start troubleshooting by looking at why the Pods aren't Ready...all pods are 0/1 Ready, meaning the container in the pod isn't ready.
kubectl get pods
#We can use kubectl describe to quickly find out why. In the Events you see the readiness probe is failing...but why?
#What port is the Container Port? What is the port configured in the Readiness Probe? Do they match?
#Readiness probe failed: Get http://192.168.222.225:8081/index.html: dial tcp 192.168.222.225:8081: connect: connection refused
kubectl describe pods
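#To compare the two ports without reading the whole describe output, you can pull them straight from the
#deployment; this assumes a single container with a single containerPort, so adjust the indexes if your manifest differs:
kubectl get deployment hello-world-2 -o jsonpath='containerPort: {.spec.template.spec.containers[0].ports[0].containerPort}{"\n"}readinessProbe port: {.spec.template.spec.containers[0].readinessProbe.httpGet.port}{"\n"}'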
#DECLARATIVE SOLUTION:
#Deploy the corrected readiness probe.
#This will cause a rollout since the pod spec changed.
kubectl apply -f deployment-2-corrected.yaml
#IMPERATIVE SOLUTION:
#This will cause a rollout since the pod spec changed.
#In the readinessProbe
#CHANGE: port: 8081
#TO:     port: 8080
kubectl edit deployment hello-world-2
#Check the Pods, all should be 1/1 Ready.
kubectl get pods
#Clean up this demo.
kubectl delete -f deployment-2-corrected.yaml
#3 - Storage - Failure to access persistent volume storage
#You'll need the NFS server we configured in the course 'Configuring and Managing Kubernetes Storage and Scheduling' for this demo.
#Create the deployment we want to troubleshoot.
kubectl apply -f deployment-3.yaml
#Scenario:
#You get a call...the pod is scheduled but it is stuck in ContainerCreating status.
#Start troubleshooting by checking out the Pod's state...ContainerCreating...ok, let's check out the events.
kubectl get pods
#Events show the error coming from the kubelet on the Node; you'll see that it said 'No such file or directory'.
#Warning FailedMount 34s kubelet, c1-node3 MountVolume.SetUp failed for volume "pv-nfs-data" : mount failed: exit status 32
#mount.nfs: mounting 172.16.94.5:/export/volumes/po failed, reason given by server: No such file or directory
kubectl describe pods
#We can also look at the events to get this information.
kubectl get events --sort-by='.metadata.creationTimestamp'
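#You can also verify the export list directly from a node. This assumes the NFS client tools (which provide
#showmount) are installed there and that 172.16.94.5 is the NFS server from the storage course:
showmount -e 172.16.94.5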
#Deploy the fix, which points to the correct NFS export, 172.16.94.5:/export/volumes/pod
#DECLARATIVE SOLUTION:
kubectl apply -f deployment-3-corrected.yaml
#(No IMPERATIVE SOLUTION for this one)
#That won't work...the PVC gets bound to the bad PV, so delete the existing deployment, PVC, and PV, and then deploy the corrected manifest.
kubectl delete -f deployment-3.yaml
#Deploy the fix again.
kubectl apply -f deployment-3-corrected.yaml
#This should be up and Running.
kubectl get pods
#Clean up this demo.
kubectl delete -f deployment-3-corrected.yaml
#4 - Scheduling
#Create the deployment we want to troubleshoot.
kubectl apply -f deployment-4.yaml
#Scenario: A user reports that some pods have started and some have not.
#Start troubleshooting...check out the pods.
#3 of the 6 pods are Pending...why? We should look at the scheduler.
kubectl get pods -o wide
#Get the scheduler events...scroll up, do you see any errors?
#Look for Warnings and Failures...
#0/4 nodes are available: 1 node(s) had taint {node-role.kubernetes.io/master: }, that the pod didn't tolerate, 3 Insufficient cpu
kubectl get events --sort-by='.metadata.creationTimestamp'
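#To cut straight to the failures, events can also be filtered by type with a standard field selector:
kubectl get events --field-selector type=Warning --sort-by='.metadata.creationTimestamp'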
#Let's check out the Pods...what are the CPU Requests? It's currently set to 1. How many CPUs are allocatable on the Node? Let's look at the Node for that.
kubectl describe pods
#Check out the details of the Node to see its resource allocations and current requests.
#How much CPU is Allocatable? It should be 2 if you're using our lab cluster.
#How much CPU is Allocated? 1250m or 1.25 vCPU...we're out of CPU to allocate on the nodes and the three Pending pods cannot start up.
kubectl describe nodes
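#A compact way to see allocatable CPU per node without reading the full describe output (custom-columns is a standard kubectl output option):
kubectl get nodes -o custom-columns=NAME:.metadata.name,ALLOCATABLE_CPU:.status.allocatable.cpu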
#DECLARATIVE SOLUTION:
#We can either add more CPUs to the cluster or adjust the requests in our Pod Spec.
#Let's change the request to 500m, or half a CPU. All pods should start.
kubectl apply -f deployment-4-corrected.yaml
#IMPERATIVE SOLUTION:
#In Pod.Spec.Container.Resources.Requests
#CHANGE: cpu: "1"
#TO:     cpu: "500m"
kubectl edit deployment hello-world-4
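#Another imperative option is kubectl set resources, which updates the requests in place and also triggers a rollout:
kubectl set resources deployment hello-world-4 --requests=cpu=500m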
#6 of 6 pods should be online...this starts a rollout because the Pod Spec was updated.
kubectl get pods
#Let's clean up this demo.
kubectl delete -f deployment-4-corrected.yaml
#5 - Services - mismatching service selector and labels
#Create the deployment we want to troubleshoot.
kubectl apply -f deployment-5.yaml
#Scenario: Pods are all online but users cannot connect to the service.
#Start troubleshooting...let's see if we can access the service.
#Get the Service's ClusterIP and store it for reuse.
SERVICEIP=$(kubectl get service hello-world-5 -o jsonpath='{ .spec.clusterIP }')
echo $SERVICEIP
#Access the service inside the cluster...connection refused...why?
curl http://$SERVICEIP
#Let's check out the endpoints behind the service...there are no endpoints. Why?
kubectl describe service hello-world-5
kubectl get endpoints hello-world-5
#Let's check the labels and selectors for the service and the pods.
#The selector for the service is Selector: app=hello-world...now let's check the labels on the Pods.
kubectl describe service hello-world-5
#Do any of the labels match the selector for the service?
#No, the labels on the pods are app=hello-world-5 and the selector is app=hello-world.
kubectl get pods --show-labels
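#You can prove the mismatch with a label selector: the service's selector matches nothing, while the pods' actual label does:
kubectl get pods -l app=hello-world
kubectl get pods -l app=hello-world-5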
#DECLARATIVE SOLUTION:
#We can edit the selector or change the labels...let's change the service selector so the pods don't need to restart.
kubectl apply -f deployment-5-corrected.yaml
#IMPERATIVE SOLUTION:
#CHANGE: selector:
#          app: hello-world
#TO:     selector:
#          app: hello-world-5
kubectl edit service hello-world-5
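#Another imperative option is a one-line patch of the service's selector (a strategic merge patch; no pod restarts needed):
kubectl patch service hello-world-5 -p '{"spec":{"selector":{"app":"hello-world-5"}}}'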
#We should have endpoints now.
kubectl get endpoints hello-world-5
#Let's access the service, does it work?
SERVICEIP=$(kubectl get service hello-world-5 -o jsonpath='{ .spec.clusterIP }')
curl http://$SERVICEIP
#Clean up this demo.
kubectl delete -f deployment-5-corrected.yaml