Azure · nguyenluan3017 · Sep 23, 2025 · Sep 24, 2025 · Sep 24, 2025 · Sep 23, 2025
@@ -5,12 +5,13 @@ This guide provides comprehensive best practices and guidelines for creating, mo
 ## Development Best Practices
 
 ### 1. **Start Small and Iterate**
-- Begin with existing scenarios and modify parameters before creating new ones
+- Begin with existing scenarios and modify their pipeline matrix parameters before creating new scenarios
 - Test with minimal resources first (e.g., 2-3 nodes) before scaling up
 - Use `new-pipeline-test.yml` for validation before final deployment
 - Validate configurations locally using instructions in [verify.md](verify.md)
 
 ### 2. **Reuse and Extend Components**
+- Look for existing pipelines that match your requirements and reuse their components
 - Leverage existing engines and topologies whenever possible
 - Extend rather than recreate infrastructure modules
 - Follow established patterns from similar scenarios

@@ -29,6 +29,7 @@ stages:
           topology: <topology> # e.g. kperf, karpenter, kwok.  see available topology under /steps/topology
           engine: <engine> # e.g. clusterloader2, crud, iperf3, kperf. see available engines under /steps/engine
           matrix: # list of test parameters to customize the provisioned resources and also used for running the python code in the engine step
+            # Only add parameters that are used in the engine or topology steps
             <case-name>:
               <key1>: <value1>
               <key2>: <value2>

@@ -16,7 +16,7 @@ Telescope supports multiple approaches to create and modify test scenarios, each
 
 ### 1. Expand Existing Scenario
 
-**Use Case**: Create variations of existing tests with different variants (e.g., node count, k8s version, capacity type, OS type)
+**Use Case-1**: Create variations of existing tests with different variants (e.g., node count, k8s version, capacity type, OS type)
 
 **Steps**:
 ```bash
@@ -55,7 +55,30 @@ scenarios/perf-eval/<scenario-name>/terraform-inputs/aws-windows.tfvars
             scale_up_timeout: "90m"
             scale_down_timeout: "90m"
 ```
-
+**Use Case-2**: Add new variants to an existing scenario (e.g., a different scale, k8s version, capacity type, or OS type) when the scenario supports matrix parameters. Instead of creating new tfvars files, you can define each variant through matrix parameters, as in the following example.
+```yaml
+- stage: azure_eastus2_xlarge_scale_spot
+  condition: |
+    or(
+      eq(variables['Build.CronSchedule.DisplayName'], 'Weekly XLarge Scale Spot'),
+      eq(variables['Build.Reason'], 'Manual')
+    )
+  jobs:
+    - template: /jobs/competitive-test.yml
+      parameters:
+        cloud: azure
+        regions:
+          - eastus2
+        terraform_input_file_mapping:
+          - eastus2: "scenarios/perf-eval/cluster-autoscaler/terraform-inputs/azure-5k.tfvars"
+        matrix:
+          xlarge-scale-spot:
+            node_count: 2001
+            pod_count: 2001
+            scale_up_timeout: "90m"
+            scale_down_timeout: "90m"
+            capacity_type: spot
+```
 ### 2. Create New Scenario with Custom Infrastructure
 
 **Use Case**: Implementing a completely new test scenario with unique infrastructure requirements
@@ -236,14 +259,14 @@ steps:
 - [ ] Configure topology
 - [ ] Create pipeline definition
 - [ ] Test in `new-pipeline-test.yml`
-- [ ] Validate locally using `verify.md`
+- [ ] Validate locally by following `verify.md` for the modules generated as part of your changes
 - [ ] Move to appropriate category directory
 
 ### For Modifications:
 - [ ] Identify existing components to modify
 - [ ] Create new variable files if needed
 - [ ] Test in `new-pipeline-test.yml` with E2E testing guide
-- [ ] Validate locally using `verify.md`
+- [ ] Validate locally by following `verify.md` for the modules generated as part of your changes
 - [ ] Update pipeline matrix parameters
 - [ ] Test parameter variations
 - [ ] Update documentation

@@ -59,7 +59,7 @@ def calculate_cpu_request_for_clusterloader2(node_label_selector, node_count, po
     cpu_request = int(cpu_request * 0.95)
     return cpu_request
 
-def override_config_clusterloader2(cpu_per_node, node_count, pod_count, scale_up_timeout, scale_down_timeout, loop_count, node_label_selector, node_selector, override_file, warmup_deployment, cl2_config_dir):
+def override_config_clusterloader2(cpu_per_node, node_count, pod_count, scale_up_timeout, scale_down_timeout, loop_count, node_label_selector, node_selector, override_file, warmup_deployment, cl2_config_dir, os_type="linux"):
     logger.info(f"CPU per node: {cpu_per_node}")
     desired_node_count = 1
     if warmup_deployment in ["true", "True"]:
@@ -83,6 +83,7 @@ def override_config_clusterloader2(cpu_per_node, node_count, pod_count, scale_up
         file.write(f"CL2_LOOP_COUNT: {loop_count}\n")
         file.write(f"CL2_NODE_LABEL_SELECTOR: {node_label_selector}\n")
         file.write(f"CL2_NODE_SELECTOR: \"{node_selector}\"\n")
+        file.write(f"CL2_OS_TYPE: {os_type}\n")
 
     file.close()
 
@@ -196,6 +197,7 @@ def main():
     parser_override.add_argument("cl2_override_file", type=str, help="Path to the overrides of CL2 config file")
     parser_override.add_argument("warmup_deployment", type=str, help="Warmup deployment to get the cpu request")
     parser_override.add_argument("cl2_config_dir", type=str, help="Path to the CL2 config directory")
+    parser_override.add_argument("--os_type", type=str, choices=["linux", "windows"], default="linux", help="Operating system type for the node pools")
 
     # Sub-command for execute_clusterloader2
     parser_execute = subparsers.add_parser("execute", help="Execute scale up operation")
@@ -220,7 +222,7 @@ def main():
     args = parser.parse_args()
 
     if args.command == "override":
-        override_config_clusterloader2(args.cpu_per_node, args.node_count, args.pod_count, args.scale_up_timeout, args.scale_down_timeout, args.loop_count, args.node_label_selector, args.node_selector, args.cl2_override_file, args.warmup_deployment, args.cl2_config_dir)
+        override_config_clusterloader2(args.cpu_per_node, args.node_count, args.pod_count, args.scale_up_timeout, args.scale_down_timeout, args.loop_count, args.node_label_selector, args.node_selector, args.cl2_override_file, args.warmup_deployment, args.cl2_config_dir, args.os_type)
     elif args.command == "execute":
         execute_clusterloader2(args.cl2_image, args.cl2_config_dir, args.cl2_report_dir, args.kubeconfig, args.provider)
     elif args.command == "collect":

@@ -12,6 +12,7 @@
 {{$refreshInterval := DefaultParam .CL2_REFRESH_INTERVAL "5s"}}
 {{$loopCount := DefaultParam .CL2_LOOP_COUNT 1}}
 {{$coolDownTime := DefaultParam .CL2_COOLDOWN_TIME "120s"}}
+{{$osType := DefaultParam .CL2_OS_TYPE "linux"}}
 {{$percentilesList := StringSplit "0.50,0.70,0.90,0.99,1.00"}}
 {{$countErrorMargin := MultiplyInt .CL2_DEPLOYMENT_SIZE 0.01}}
 
@@ -44,6 +45,7 @@ steps:
         Replicas: {{$deploymentSize}}
         CPUperJob: {{$deploymentCpu}}
         NodeSelector: {{ (StructuralData $nodeSelector) }}
+        OSType: {{$osType}}
 - name: Measure nodes and pods scale up {{$i}}
   measurements:
   - Identifier: WaitForRunningPodsUp {{$i}}
@@ -87,6 +89,7 @@ steps:
       templateFillMap:
         Replicas: {{$deploymentSize}}
         CPUperJob: {{$deploymentCpu}}
+        OSType: {{$osType}}
 - name: Measure nodes and pods scale down {{$i}}
   measurements:
   - Identifier: WaitForRunningPodsDown {{$i}}

@@ -1,3 +1,5 @@
+{{$OSType := DefaultParam .OSType "linux"}}
+
 apiVersion: apps/v1
 kind: Deployment
 metadata:
@@ -19,15 +21,26 @@ spec:
         {{ (StructuralData $key ) }}: {{ $value }}
       {{- end }}
       containers:
-        - name: nginx
+        - name: load-generator
+          {{if eq $OSType "windows"}}
+          image: mcr.microsoft.com/windows/nanoserver:ltsc2022
+          {{else}}
           image: mcr.microsoft.com/cbl-mariner/base/nginx:1
+          {{end}}
           resources:
             requests:
               cpu: {{.CPUperJob}}
             limits:
               cpu: {{.CPUperJob}}
+          {{if eq $OSType "windows"}}
+          command:
+          - "cmd.exe"
+          - "/c"
+          - "echo %date% %time% && ping -n 6001 127.0.0.1 > nul"
+          {{else}}
           command:
             - "/bin/bash"
             - "-c"
             - runtime="1 minute"; endtime=$(date -ud "$runtime" +%s); while [[ $(date -u +%s) -le $endtime ]]; do echo $(date); sleep 6000; done
+          {{end}}
       restartPolicy: Always
@@ -69,6 +69,14 @@ def validate(self, **kwargs):
     def collect(self, **kwargs) -> str:
         pass
 
+    @property
+    def parser(self) -> argparse.ArgumentParser:
+        return self._parser
+
+    @property
+    def command_parser(self) -> argparse.ArgumentParser:
+        return self._command_parser
+
     def get_measurement(self, file_path):
         file_name = os.path.basename(file_path)
         for file_prefix, measurement in POD_STARTUP_LATENCY_FILE_PREFIX_MEASUREMENT_MAP.items():

@@ -101,7 +101,7 @@ def test_override_config_clusterloader2(self, mock_open, mock_logger, mock_warmu
         # Mock the CPU request calculation
         mock_calculate_cpu_request.return_value = 1900
 
-        override_config_clusterloader2(2, 100, 1000, '5m', '5m', 1, 'autoscaler = true', '{autoscaler : true}', 'override_file', 'false', '/mock/path')
+        override_config_clusterloader2(2, 100, 1000, '5m', '5m', 1, 'autoscaler = true', '{autoscaler : true}', 'override_file', 'false', '/mock/path', 'linux')
         mock_open.assert_any_call('override_file', 'w', encoding='utf-8')
         handle = mock_open()
         handle.write.assert_any_call('CL2_DEPLOYMENT_CPU: 1900m\n')
@@ -114,14 +114,15 @@ def test_override_config_clusterloader2(self, mock_open, mock_logger, mock_warmu
         handle.write.assert_any_call('CL2_LOOP_COUNT: 1\n')
         handle.write.assert_any_call('CL2_NODE_LABEL_SELECTOR: autoscaler = true\n')
         handle.write.assert_any_call('CL2_NODE_SELECTOR: "{autoscaler : true}"\n')
+        handle.write.assert_any_call('CL2_OS_TYPE: linux\n')
 
         mock_logger.info.assert_any_call("CPU per node: 2")
         mock_logger.info.assert_any_call("Total number of nodes: 100, total number of pods: 1000")
         mock_logger.info.assert_any_call("CPU request for each pod: 1900m")
 
         # Test with warmup deployment true
         mock_warmup.retun_value = None
-        override_config_clusterloader2(2, 100, 1000, '5m', '5m', 1, 'autoscaler = true', '{autoscaler : true}', 'override_file', 'true', '/mock/path')
+        override_config_clusterloader2(2, 100, 1000, '5m', '5m', 1, 'autoscaler = true', '{autoscaler : true}', 'override_file', 'true', '/mock/path', 'windows')
         mock_open.assert_any_call('override_file', 'w', encoding='utf-8')
         handle = mock_open()
         handle.write.assert_any_call('CL2_DEPLOYMENT_CPU: 1900m\n')
@@ -134,6 +135,18 @@ def test_override_config_clusterloader2(self, mock_open, mock_logger, mock_warmu
         handle.write.assert_any_call('CL2_LOOP_COUNT: 1\n')
         handle.write.assert_any_call('CL2_NODE_LABEL_SELECTOR: autoscaler = true\n')
         handle.write.assert_any_call('CL2_NODE_SELECTOR: "{autoscaler : true}"\n')
+        handle.write.assert_any_call('CL2_OS_TYPE: windows\n')
+        handle = mock_open()
+        handle.write.assert_any_call('CL2_DEPLOYMENT_CPU: 1900m\n')
+        handle.write.assert_any_call('CL2_MIN_NODE_COUNT: 100\n')
+        handle.write.assert_any_call('CL2_MAX_NODE_COUNT: 110\n')
+        handle.write.assert_any_call('CL2_DESIRED_NODE_COUNT: 0\n')
+        handle.write.assert_any_call('CL2_DEPLOYMENT_SIZE: 1000\n')
+        handle.write.assert_any_call('CL2_SCALE_UP_TIMEOUT: 5m\n')
+        handle.write.assert_any_call('CL2_SCALE_DOWN_TIMEOUT: 5m\n')
+        handle.write.assert_any_call('CL2_LOOP_COUNT: 1\n')
+        handle.write.assert_any_call('CL2_NODE_LABEL_SELECTOR: autoscaler = true\n')
+        handle.write.assert_any_call('CL2_NODE_SELECTOR: "{autoscaler : true}"\n')
 
     @patch('clusterloader2.autoscale.autoscale.run_cl2_command')
     def test_execute_clusterloader2(self, mock_run_cl2_command):
@@ -214,7 +227,7 @@ def test_override_command(self, mock_override):
             mock_override.assert_called_once_with(
                 4, 3, 200, '10m', '5m', 2,
                 'nodepool=default', 'env=prod',
-                'override.yaml', 'warmup-deploy', 'config-dir'
+                'override.yaml', 'warmup-deploy', 'config-dir', 'linux'
             )
 
     @patch('clusterloader2.autoscale.autoscale.execute_clusterloader2')