Skip to content

Commit 940a7a7

Browse files
ci: [NPM] improve scale pipeline & fix edge case in scale script (#1975)
* ci: check if directory is empty before applying it
* ci: don't wait for pods if they weren't created
* docs: fix script name
* ci: wip for enabling linux scale test
* ci: parameters for linux vs windows
* ci: adjust params
* ci: fix bash typo
* ci: fix cp
* ci: fix npm url
* ci: increase max pods for linux nodepool
* ci: start building windows image again
* tmp: use apply netpol in background image
* Revert "tmp: use apply netpol in background image" This reverts commit eff43c5.
* refactor: use CLUSTER_NAME variable
* ci: require succeeded() for scale & conformance tests
* test: fix vars used in test-scale.sh checks
* ci: disable linux, reenable windows
* ci: increase sleep before waiting for NPM to start & log info when it doesn't
* ci: better log capture & remove command from other pipeline
* ci: do not get logs of npm on kwok nodes
* ci: do not get logs of npm on kwok nodes (part 2)
1 parent e6eeb50 commit 940a7a7

File tree

4 files changed

+113
-61
lines changed

4 files changed

+113
-61
lines changed

.pipelines/npm/npm-conformance-tests.yaml

+1
Original file line number | Diff line number | Diff line change
@@ -509,6 +509,7 @@ jobs:
509509
azureSubscription: $(BUILD_VALIDATIONS_SERVICE_CONNECTION)
510510
scriptType: "bash"
511511
scriptLocation: "inlineScript"
512+
condition: succeeded()
512513
inlineScript: |
513514
echo Deleting $(RESOURCE_GROUP)
514515
az group delete -n $(RESOURCE_GROUP) --yes

.pipelines/npm/npm-scale-test.yaml

+84-51
Original file line number | Diff line number | Diff line change
@@ -46,6 +46,10 @@ jobs:
4646
name: "$(BUILD_POOL_NAME_DEFAULT)"
4747
strategy:
4848
matrix:
49+
npm_linux_amd64:
50+
arch: amd64
51+
name: npm
52+
os: linux
4953
npm_windows2022_amd64:
5054
arch: amd64
5155
name: npm
@@ -74,8 +78,14 @@ jobs:
7478
FQDN: empty
7579
strategy:
7680
matrix:
77-
v2-windows:
78-
PROFILE: "scale-win"
81+
# v2-linux:
82+
# PROFILE: "sc-lin"
83+
# NUM_NETPOLS: 800
84+
# INITIAL_CONNECTIVITY_TIMEOUT: 60
85+
ws22:
86+
PROFILE: "sc-ws22"
87+
NUM_NETPOLS: 50
88+
INITIAL_CONNECTIVITY_TIMEOUT: 720
7989
steps:
8090
- checkout: self
8191
- bash: |
@@ -115,44 +125,46 @@ jobs:
115125
az extension add --name aks-preview
116126
az extension update --name aks-preview
117127
118-
export CLUSTER_NAME=$(RESOURCE_GROUP)-$(PROFILE)
119-
120-
echo "Creating resource group named $CLUSTER_NAME"
121-
az group create --name $CLUSTER_NAME -l $(LOCATION) -o table
128+
echo "Creating resource group named $(RESOURCE_GROUP)"
129+
az group create --name $(RESOURCE_GROUP) -l $(LOCATION) -o table
122130
123-
echo "Creating resource group named $CLUSTER_NAME"
131+
export CLUSTER_NAME=$(RESOURCE_GROUP)-$(PROFILE)
132+
echo "Creating cluster named $CLUSTER_NAME"
124133
az aks create \
125-
--resource-group $CLUSTER_NAME \
134+
--resource-group $(RESOURCE_GROUP) \
126135
--name $CLUSTER_NAME \
127136
--generate-ssh-keys \
128137
--windows-admin-username e2eadmin \
129138
--windows-admin-password alpha@numeric!password2 \
130139
--network-plugin azure \
131140
--vm-set-type VirtualMachineScaleSets \
132141
--node-vm-size Standard_D4s_v3 \
133-
--node-count 1
134-
135-
# don't schedule anything on the linux system pool
136-
echo "Updating $CLUSTER_NAME to not schedule anything on linux pool..."
137-
az aks nodepool update \
138-
--cluster-name $CLUSTER_NAME \
139-
-g $CLUSTER_NAME \
140-
-n nodepool1 \
141-
--node-taints CriticalAddonsOnly=true:NoSchedule
142-
143-
echo "Adding Windows nodepool to $CLUSTER_NAME"
144-
az aks nodepool add \
145-
--resource-group $CLUSTER_NAME \
146-
--cluster-name $CLUSTER_NAME \
147-
--name awin22 \
148-
--os-type Windows \
149-
--os-sku Windows2022 \
150-
--node-vm-size Standard_D4s_v3 \
151142
--node-count 1 \
152143
--max-pods 100
153144
145+
if [[ $(PROFILE) == *ws22 ]]; then
146+
# don't schedule anything on the linux system pool
147+
echo "Updating $CLUSTER_NAME to not schedule anything on linux pool..."
148+
az aks nodepool update \
149+
--cluster-name $CLUSTER_NAME \
150+
-g $(RESOURCE_GROUP) \
151+
-n nodepool1 \
152+
--node-taints CriticalAddonsOnly=true:NoSchedule
153+
154+
echo "Adding Windows nodepool to $CLUSTER_NAME"
155+
az aks nodepool add \
156+
--resource-group $(RESOURCE_GROUP) \
157+
--cluster-name $CLUSTER_NAME \
158+
--name awin22 \
159+
--os-type Windows \
160+
--os-sku Windows2022 \
161+
--node-vm-size Standard_D4s_v3 \
162+
--node-count 1 \
163+
--max-pods 100
164+
fi
165+
154166
echo "Getting credentials to $CLUSTER_NAME"
155-
az aks get-credentials -g $CLUSTER_NAME -n $CLUSTER_NAME --overwrite-existing --file ./kubeconfig
167+
az aks get-credentials -g $(RESOURCE_GROUP) -n $CLUSTER_NAME --overwrite-existing --file ./kubeconfig
156168
mkdir -p ~/.kube/
157169
cp ./kubeconfig ~/.kube/config
158170
@@ -168,28 +180,42 @@ jobs:
168180
set -e
169181
170182
# deploy azure-npm
171-
cp $(Pipeline.Workspace)/s/npm/examples/windows/azure-npm.yaml azure-npm.yaml
172-
# set higher memory limit
183+
cp $(Pipeline.Workspace)/s/npm/azure-npm.yaml azure-npm.yaml
173184
sed -i 's/memory: 300Mi/memory: 1000Mi/g' azure-npm.yaml
174185
kubectl apply -f azure-npm.yaml
175186
187+
cp $(Pipeline.Workspace)/s/npm/examples/windows/azure-npm.yaml azure-npm-win.yaml
188+
# set higher memory limit
189+
sed -i 's/memory: 300Mi/memory: 1000Mi/g' azure-npm-win.yaml
190+
kubectl apply -f azure-npm-win.yaml
191+
176192
# swap azure-npm image with one built during run
193+
kubectl set image daemonset/azure-npm -n kube-system azure-npm=$IMAGE_REGISTRY/azure-npm:linux-amd64-$(TAG)
177194
kubectl set image daemonset/azure-npm-win -n kube-system azure-npm=$IMAGE_REGISTRY/azure-npm:windows-amd64-ltsc2022-$(TAG)
178195
179-
sleep 5s
196+
sleep 30s
180197
echo "waiting for NPM to start running..."
181-
kubectl wait --for=condition=Ready pod -l k8s-app=azure-npm -n kube-system --timeout=20m
198+
kubectl wait --for=condition=Ready pod -l k8s-app=azure-npm -n kube-system --timeout=15m || {
199+
kubectl describe pod -n kube-system -l k8s-app=azure-npm
200+
echo "##vso[task.logissue type=error]NPM failed to start running"
201+
exit 1
202+
}
182203
echo "sleep 3m to let NPM restart in case of bootup failure due to HNS errors"
183204
sleep 3m
184205
185206
kubectl get po -n kube-system -owide -A
186207
187-
echo "labeling Windows nodes for scale test"
188-
kubectl get node -o wide | grep "Windows Server 2022 Datacenter" | awk '{print $1}' | xargs -n 1 -I {} kubectl label node {} scale-test=true connectivity-test=true
208+
if [[ $(PROFILE) == *ws22 ]]; then
209+
echo "labeling Windows nodes for scale test"
210+
kubectl get node -o wide | grep "Windows Server 2022 Datacenter" | awk '{print $1}' | xargs -n 1 -I {} kubectl label node {} scale-test=true connectivity-test=true
211+
else
212+
echo "labeling Linux nodes for scale test"
213+
kubectl get node -o wide | grep "Ubuntu" | awk '{print $1}' | xargs -n 1 -I {} kubectl label node {} scale-test=true connectivity-test=true
214+
fi
189215
190216
export CLUSTER_NAME=$(RESOURCE_GROUP)-$(PROFILE)
191217
echo "Showing cluster status for $CLUSTER_NAME"
192-
FQDN=`az aks show -n $CLUSTER_NAME -g $CLUSTER_NAME --query fqdn -o tsv`
218+
FQDN=`az aks show -n $CLUSTER_NAME -g $(RESOURCE_GROUP) --query fqdn -o tsv`
193219
echo "##vso[task.setvariable variable=FQDN]$FQDN"
194220
195221
- task: AzureCLI@2
@@ -202,15 +228,16 @@ jobs:
202228
condition: succeeded()
203229
inlineScript: |
204230
set -e
205-
mkdir -p $(System.DefaultWorkingDirectory)/$(RESOURCE_GROUP)-$(PROFILE)
231+
export CLUSTER_NAME=$(RESOURCE_GROUP)-$(PROFILE)
232+
mkdir -p $(System.DefaultWorkingDirectory)/$CLUSTER_NAME
206233
./kwok --kubeconfig ~/.kube/config \
207234
--cidr=155.0.0.0/16 \
208235
--node-ip=155.0.0.1 \
209236
--manage-all-nodes=false \
210237
--manage-nodes-with-annotation-selector=kwok.x-k8s.io/node=fake \
211238
--manage-nodes-with-label-selector= \
212239
--disregard-status-with-annotation-selector=kwok.x-k8s.io/status=custom \
213-
--disregard-status-with-label-selector= > $(System.DefaultWorkingDirectory)/$(RESOURCE_GROUP)-$(PROFILE)/kwok-scale-up.log &
240+
--disregard-status-with-label-selector= > $(System.DefaultWorkingDirectory)/$CLUSTER_NAME/kwok-scale-up.log &
214241
kwok_pid=$!
215242
216243
# 20 kwok nodes
@@ -229,8 +256,8 @@ jobs:
229256
--max-real-pods-per-node=30 \
230257
--num-real-deployments=10 \
231258
--num-real-replicas=3 \
232-
--num-network-policies=50 \
233-
--num-unapplied-network-policies=50 \
259+
--num-network-policies=$(NUM_NETPOLS) \
260+
--num-unapplied-network-policies=$(NUM_NETPOLS) \
234261
--num-unique-labels-per-pod=2 \
235262
--num-unique-labels-per-deployment=2 \
236263
--num-shared-labels-per-pod=10
@@ -248,28 +275,30 @@ jobs:
248275
condition: succeeded()
249276
inlineScript: |
250277
set -e
251-
mkdir -p $(System.DefaultWorkingDirectory)/$(RESOURCE_GROUP)-$(PROFILE)
278+
export CLUSTER_NAME=$(RESOURCE_GROUP)-$(PROFILE)
279+
mkdir -p $(System.DefaultWorkingDirectory)/$CLUSTER_NAME
252280
./kwok --kubeconfig ~/.kube/config \
253281
--cidr=155.0.0.0/16 \
254282
--node-ip=155.0.0.1 \
255283
--manage-all-nodes=false \
256284
--manage-nodes-with-annotation-selector=kwok.x-k8s.io/node=fake \
257285
--manage-nodes-with-label-selector= \
258286
--disregard-status-with-annotation-selector=kwok.x-k8s.io/status=custom \
259-
--disregard-status-with-label-selector= > $(System.DefaultWorkingDirectory)/$(RESOURCE_GROUP)-$(PROFILE)/kwok-bootup-latency.log &
287+
--disregard-status-with-label-selector= > $(System.DefaultWorkingDirectory)/$CLUSTER_NAME/kwok-bootup-latency.log &
260288
kwok_pid=$!
261289
262290
kubectl rollout restart -n kube-system ds azure-npm-win
263291
echo "sleeping 3 minutes to allow NPM pods to restart after scale-up..."
264292
sleep 3m
265293
266294
cd $(Pipeline.Workspace)/s/test/scale/connectivity/
295+
# notes for Windows:
267296
# initial connectivity should be established within 15 minutes of NPM restart (12 minute timeout since we already waited 3 minutes above)
268297
# adding new network policy to all 30 Pods should happen within 30 seconds
269298
set +e
270299
./test-connectivity.sh --kubectl-binary=$kubectlPath \
271300
--num-scale-pods-to-verify=all \
272-
--max-wait-for-initial-connectivity=$((12*60)) \
301+
--max-wait-for-initial-connectivity=$(INITIAL_CONNECTIVITY_TIMEOUT) \
273302
--max-wait-after-adding-netpol=30
274303
rc=$?
275304
if [[ $rc != 0 ]]; then
@@ -286,18 +315,19 @@ jobs:
286315
scriptType: "bash"
287316
scriptLocation: "inlineScript"
288317
failOnStderr: true
289-
# condition: succeeded()
318+
condition: succeeded()
290319
inlineScript: |
291320
set -e
292-
mkdir -p $(System.DefaultWorkingDirectory)/$(RESOURCE_GROUP)-$(PROFILE)
321+
export CLUSTER_NAME=$(RESOURCE_GROUP)-$(PROFILE)
322+
mkdir -p $(System.DefaultWorkingDirectory)/$CLUSTER_NAME
293323
./kwok --kubeconfig ~/.kube/config \
294324
--cidr=155.0.0.0/16 \
295325
--node-ip=155.0.0.1 \
296326
--manage-all-nodes=false \
297327
--manage-nodes-with-annotation-selector=kwok.x-k8s.io/node=fake \
298328
--manage-nodes-with-label-selector= \
299329
--disregard-status-with-annotation-selector=kwok.x-k8s.io/status=custom \
300-
--disregard-status-with-label-selector= > $(System.DefaultWorkingDirectory)/$(RESOURCE_GROUP)-$(PROFILE)/kwok-crud.log &
330+
--disregard-status-with-label-selector= > $(System.DefaultWorkingDirectory)/$CLUSTER_NAME/kwok-crud.log &
301331
kwok_pid=$!
302332
303333
# will delete scale-test and connectivity-test namespaces from previous run
@@ -342,15 +372,16 @@ jobs:
342372
condition: succeeded()
343373
inlineScript: |
344374
set -e
345-
mkdir -p $(System.DefaultWorkingDirectory)/$(RESOURCE_GROUP)-$(PROFILE)
375+
export CLUSTER_NAME=$(RESOURCE_GROUP)-$(PROFILE)
376+
mkdir -p $(System.DefaultWorkingDirectory)/$CLUSTER_NAME
346377
./kwok --kubeconfig ~/.kube/config \
347378
--cidr=155.0.0.0/16 \
348379
--node-ip=155.0.0.1 \
349380
--manage-all-nodes=false \
350381
--manage-nodes-with-annotation-selector=kwok.x-k8s.io/node=fake \
351382
--manage-nodes-with-label-selector= \
352383
--disregard-status-with-annotation-selector=kwok.x-k8s.io/status=custom \
353-
--disregard-status-with-label-selector= > $(System.DefaultWorkingDirectory)/$(RESOURCE_GROUP)-$(PROFILE)/kwok-crud-connectivity.log &
384+
--disregard-status-with-label-selector= > $(System.DefaultWorkingDirectory)/$CLUSTER_NAME/kwok-crud-connectivity.log &
354385
kwok_pid=$!
355386
356387
cd $(Pipeline.Workspace)/s/test/scale/connectivity/
@@ -371,14 +402,15 @@ jobs:
371402
372403
- bash: |
373404
export CLUSTER_NAME=$(RESOURCE_GROUP)-$(PROFILE)
374-
cp cyclonus-$CLUSTER_NAME $(System.DefaultWorkingDirectory)/$CLUSTER_NAME/cyclonus-$CLUSTER_NAME
375405
echo "Getting cluster state for $CLUSTER_NAME"
376406
mkdir -p $(System.DefaultWorkingDirectory)/$CLUSTER_NAME
377-
kubectl get pods -n kube-system | grep npm
378-
kubectl logs -n kube-system -l k8s-app=azure-npm --tail -1 --prefix > $(System.DefaultWorkingDirectory)/$CLUSTER_NAME/npm-logs_$(PROFILE).txt
379-
# capture any previous logs in case there was a crash
380-
npmPodList=`kubectl get pods -n kube-system | grep npm | awk '{print $1}'`
407+
kubectl get pods -n kube-system -owide | grep npm | grep -v kwok
408+
npmPodList=`kubectl get pods -n kube-system -owide | grep npm | grep -v kwok | awk '{print $1}'`
381409
for npmPod in $npmPodList; do
410+
logFile=$(System.DefaultWorkingDirectory)/$CLUSTER_NAME/npm-logs_$(PROFILE)-$npmPod.txt
411+
kubectl logs -n kube-system $npmPod > $logFile
412+
413+
# capture any previous logs in case there was a crash
382414
previousLogFile=$(System.DefaultWorkingDirectory)/$CLUSTER_NAME/previous-npm-logs_$(PROFILE).txt
383415
kubectl logs -n kube-system $npmPod -p > $previousLogFile
384416
if [[ $? -ne 0 ]]; then
@@ -413,6 +445,7 @@ jobs:
413445
azureSubscription: $(BUILD_VALIDATIONS_SERVICE_CONNECTION)
414446
scriptType: "bash"
415447
scriptLocation: "inlineScript"
448+
condition: succeeded()
416449
inlineScript: |
417450
echo Deleting $(RESOURCE_GROUP)
418451
az group delete -n $(RESOURCE_GROUP) --yes

test/scale/README.md

+1-1
Original file line number | Diff line number | Diff line change
@@ -41,7 +41,7 @@ This saves us from:
4141

4242
Note: you must run `./test-scale.sh` first with `--num-network-policies=1` or more, and `--num-shared-labels-per-pod=3` or more.
4343
```
44-
./test-connectivity --num-scale-pods-to-verify=all \
44+
./test-connectivity.sh --num-scale-pods-to-verify=all \
4545
--max-wait-for-initial-connectivity=600 \
4646
--max-wait-after-adding-netpol=120
4747
```

test/scale/test-scale.sh

+27-9
Original file line number | Diff line number | Diff line change
@@ -261,13 +261,17 @@ wait_for_pods() {
261261
# wait for all pods to run
262262
minutesToWaitForRealPods=$(( 10 + $numRealPods / 250 ))
263263
set -x
264-
$KUBECTL $KUBECONFIG_ARG wait --for=condition=Ready pods -n scale-test -l is-real=true --all --timeout="${minutesToWaitForRealPods}m"
264+
if [[ $numRealPods -gt 0 ]]; then
265+
$KUBECTL $KUBECONFIG_ARG wait --for=condition=Ready pods -n scale-test -l is-real=true --all --timeout="${minutesToWaitForRealPods}m"
266+
fi
265267
set +x
266268

267269
# just make sure kwok pods are Running, not necessarily Ready (sometimes kwok pods have NodeNotReady even though the node is ready)
268270
minutesToWaitForKwokPods=$(( 1 + $numKwokPods / 500 ))
269271
set -x
270-
$KUBECTL $KUBECONFIG_ARG wait --for=condition=Initialized pods -n scale-test -l is-kwok=true --all --timeout="${minutesToWaitForKwokPods}m"
272+
if [[ $numKwokPods -gt 0 ]]; then
273+
$KUBECTL $KUBECONFIG_ARG wait --for=condition=Initialized pods -n scale-test -l is-kwok=true --all --timeout="${minutesToWaitForKwokPods}m"
274+
fi
271275
set +x
272276
}
273277

@@ -404,9 +408,15 @@ echo
404408

405409
set -x
406410
$KUBECTL $KUBECONFIG_ARG create ns scale-test
407-
$KUBECTL $KUBECONFIG_ARG apply -f generated/kwok-nodes/
408-
$KUBECTL $KUBECONFIG_ARG apply -f generated/deployments/real/
409-
$KUBECTL $KUBECONFIG_ARG apply -f generated/deployments/kwok/
411+
if [[ $numKwokNodes -gt 0 ]]; then
412+
$KUBECTL $KUBECONFIG_ARG apply -f generated/kwok-nodes/
413+
fi
414+
if [[ $numRealPods -gt 0 ]]; then
415+
$KUBECTL $KUBECONFIG_ARG apply -f generated/deployments/real/
416+
fi
417+
if [[ $numKwokPods -gt 0 ]]; then
418+
$KUBECTL $KUBECONFIG_ARG apply -f generated/deployments/kwok/
419+
fi
410420
set +x
411421

412422
add_shared_labels() {
@@ -441,8 +451,12 @@ if [[ $numUniqueLabelsPerPod -gt 0 ]]; then
441451
fi
442452

443453
set -x
444-
$KUBECTL $KUBECONFIG_ARG apply -f generated/networkpolicies/unapplied
445-
$KUBECTL $KUBECONFIG_ARG apply -f generated/networkpolicies/applied
454+
if [[ $numUnappliedNetworkPolicies -gt 0 ]]; then
455+
$KUBECTL $KUBECONFIG_ARG apply -f generated/networkpolicies/unapplied
456+
fi
457+
if [[ $numNetworkPolicies -gt 0 ]]; then
458+
$KUBECTL $KUBECONFIG_ARG apply -f generated/networkpolicies/applied
459+
fi
446460
set +x
447461

448462
wait_for_pods
@@ -470,8 +484,12 @@ if [[ $deleteNetpols == true ]]; then
470484

471485
echo "re-adding network policies. round $i/$deleteNetpolsTimes..."
472486
set -x
473-
$KUBECTL $KUBECONFIG_ARG apply -f generated/networkpolicies/unapplied
474-
$KUBECTL $KUBECONFIG_ARG apply -f generated/networkpolicies/applied
487+
if [[ $numUnappliedNetworkPolicies -gt 0 ]]; then
488+
$KUBECTL $KUBECONFIG_ARG apply -f generated/networkpolicies/unapplied
489+
fi
490+
if [[ $numNetworkPolicies -gt 0 ]]; then
491+
$KUBECTL $KUBECONFIG_ARG apply -f generated/networkpolicies/applied
492+
fi
475493
set +x
476494
echo "sleeping $deleteNetpolsInterval seconds after readding network policies (end of round $i/$deleteNetpolsTimes)..."
477495
sleep $deleteNetpolsInterval

0 commit comments

Comments
 (0)