diff --git a/src/nvme-conversion/DEVELOPMENT.md b/src/nvme-conversion/DEVELOPMENT.md new file mode 100755 index 00000000000..f297f94817a --- /dev/null +++ b/src/nvme-conversion/DEVELOPMENT.md @@ -0,0 +1,255 @@ +# NVMe Conversion Extension — Development Guide + +## Origin & Context + +This extension is a modernized Azure CLI port of the PowerShell script +[Azure-NVMe-Conversion.ps1](https://github.com/Azure/SAP-on-Azure-Scripts-and-Utilities/tree/main/Azure-NVMe-Utils) +from the SAP-on-Azure-Scripts-and-Utilities repository. The original script (1,323 lines of PowerShell) +was converted to a Python-based Azure CLI extension following the +[Azure CLI extension authoring guidelines](https://github.com/Azure/azure-cli/blob/dev/doc/extensions/authoring.md). + +## Reference Documentation + +### Azure CLI Extension Development + +| Resource | URL | +|---|---| +| Extension authoring guide | https://github.com/Azure/azure-cli/blob/dev/doc/extensions/authoring.md | +| Extension metadata spec | https://github.com/Azure/azure-cli/blob/dev/doc/extensions/metadata.md | +| Extension versioning guidelines | https://github.com/Azure/azure-cli/blob/dev/doc/extensions/versioning_guidelines.md | +| Extension summary guidelines | https://github.com/Azure/azure-cli/blob/dev/doc/extensions/extension_summary_guidelines.md | +| Extension FAQ | https://github.com/Azure/azure-cli/blob/dev/doc/extensions/faq.md | +| Authoring commands guide | https://github.com/Azure/azure-cli/blob/main/doc/authoring_command_modules/authoring_commands.md | +| azdev CLI dev tools | https://github.com/Azure/azure-cli-dev-tools | +| azure-cli-extensions repo | https://github.com/Azure/azure-cli-extensions | + +### Azure NVMe & Disk Controller + +| Resource | URL | +|---|---| +| Original PowerShell script | https://github.com/Azure/SAP-on-Azure-Scripts-and-Utilities/tree/main/Azure-NVMe-Utils | +| Azure NVMe overview | https://learn.microsoft.com/azure/virtual-machines/enable-nvme-interface | +| VM sizes with 
NVMe support | https://learn.microsoft.com/azure/virtual-machines/sizes/overview | +| Disk controller types API | `DiskControllerTypes` capability in `resource_skus.list()` | +| az vm update --disk-controller-type | https://learn.microsoft.com/cli/azure/vm#az-vm-update (Preview) | +| azure-vm-utils (udev rules) | https://github.com/Azure/azure-vm-utils | + +### Python SDK + +| Package | Used for | +|---|---| +| `azure-mgmt-compute` (bundled in CLI core) | VM, disk, extension, SKU, RunCommand operations | +| `azure.cli.core.azclierror` | Semantic error types (ValidationError, ResourceNotFoundError, InvalidArgumentValueError) | +| `azure.cli.core.commands.parameters` | `get_enum_type` for --controller-type | +| `azure.cli.testsdk` | ScenarioTest base class for live tests | + +## Architecture + +``` +azext_nvme_conversion/ +├── __init__.py # CommandsLoader — entry point +├── commands.py # Command registration + table formatters +├── _params.py # Argument definitions + validators +├── _help.py # Help text (YAML-like format) +├── _validators.py # Parameter validators (vm_size, sleep_seconds) +├── _format.py # Table output formatters for -o table +├── _client_factory.py # ComputeManagementClient factory +├── custom.py # Core orchestration logic (convert + check) +├── _windows_checks.py # Windows: version check + stornvme driver +├── _linux_checks.py # Linux: RunCommand wrapper +├── _linux_script.py # Embedded 500-line bash script +├── azext_metadata.json # Extension metadata (preview, min CLI version) +└── tests/latest/ + └── test_nvme_conversion.py # 56 unit + integration tests +``` + +### Data Flow + +``` +User runs: az nvme-conversion convert -g myRG -n myVM --start-vm + + ┌─ custom.py ────────────────────────────────────────────────────────────────────────────┐ + │ [1/8] _validate_vm() → ComputeClient.virtual_machines.get() │ + │ [2/8] _resolve_vm_size() → ComputeClient.resource_skus.list() │ + │ [3/8] _check_ade_extension() → 
ComputeClient.virtual_machine_extensions.get() │ + │ _check_vm_generation() → ComputeClient.disks.get() │ + │ _check_vm_power_state()→ ComputeClient.virtual_machines.instance_view() │ + │ [4/8] _validate_sku() → ComputeClient.resource_skus.list() │ + │ [5/8] _prepare_os() → _windows_checks.py or _linux_checks.py │ + │ └─ RunCommand → ComputeClient.virtual_machines.begin_run_command() │ + │ [6/8] _stop_vm() → ComputeClient.virtual_machines.begin_deallocate() │ + │ [7/8] _update_disk_capabilities() → ComputeClient.disks.begin_update() │ + │ _update_vm() → ComputeClient.virtual_machines.begin_create_or_update() │ + │ [8/8] _start_vm() → ComputeClient.virtual_machines.begin_start() │ + └────────────────────────────────────────────────────────────────────────────────────────┘ +``` + +## Development Setup + +### Prerequisites + +- Python 3.10+ (CLI supports 3.10–3.13) +- Azure CLI installed (`az --version`) +- WSL (for Linux testing) or native Linux + +### Quick Start (without azdev) + +```bash +cd src/nvme-conversion + +# Build wheel +python3 setup.py bdist_wheel + +# Install extension +az extension remove -n nvme-conversion 2>/dev/null +az extension add --source dist/nvme_conversion-1.0.0b1-py3-none-any.whl --yes + +# Run tests +python3 -m pytest azext_nvme_conversion/tests/latest/test_nvme_conversion.py -v \ + -k "not NvmeConversionCheckTest and not NvmeConversionConvertTest" + +# Lint +python3 -m flake8 --max-line-length=120 --ignore=E501,W503,W504,C901 \ + --exclude=_linux_script.py azext_nvme_conversion/*.py +``` + +### Full Setup (with azdev) + +```bash +# Create virtual environment +python3 -m venv .venv-azdev +source .venv-azdev/bin/activate + +# Install azdev +pip install azdev + +# Setup with extension repo +azdev setup -r -e nvme-conversion + +# Validate (run before every PR) +azdev style nvme-conversion +azdev test nvme-conversion +azdev linter nvme-conversion +``` + +## Key Design Decisions + +### Why custom commands (not AAZ)? 
+ +The extension orchestrates **8 sequential Azure API calls** (VM get → extension check → SKU check → +RunCommand → VM stop → disk PATCH → VM update → VM start). AAZ is designed for single-resource +CRUD operations. Custom orchestration with `custom.py` is required. + +### SDK vs Raw REST + +The original PowerShell script uses `Invoke-RestMethod` with a bearer token for the disk PATCH. +We use `ComputeManagementClient.disks.begin_update()` which provides automatic auth, retry, +and error handling. + +### Error types + +| Error type | When used | +|---|---| +| `ResourceNotFoundError` | VM not found | +| `ValidationError` | Gen1, ADE installed, OS not ready, VM not deallocated | +| `InvalidArgumentValueError` | Bad SKU, SKU doesn't support target controller, invalid --vm-size | + +### SKU DiskControllerTypes behavior + +| `DiskControllerTypes` value | Meaning | NVMe target | SCSI target | +|---|---|---|---| +| `"SCSI, NVMe"` | Both supported | OK | OK | +| `"NVMe"` | NVMe only (v6+) | OK | BLOCKED | +| `"SCSI"` | SCSI only | BLOCKED | OK | +| absent | SCSI only (old SKUs) | BLOCKED | OK | + +### Udev rules strategy + +When `azure-vm-utils` is not installed and `--fix-os` is used, the extension deploys a fallback +udev rule (`99-azure-nvme-fallback.rules`) that provides: +- `io_timeout=240s` for NVMe remote disks +- `/dev/disk/azure/root` and `/dev/disk/azure/data/by-lun/N` symlinks + +It does NOT provide `by-name`, `by-serial`, `by-index` (those require the `azure-nvme-id` binary +from `azure-vm-utils`). The fallback rule is numbered 99 so `80-azure-disk.rules` takes precedence +when `azure-vm-utils` is later installed. 
+ +## Testing + +### Unit tests (56 tests) + +```bash +python3 -m pytest azext_nvme_conversion/tests/latest/test_nvme_conversion.py -v \ + -k "not NvmeConversionCheckTest and not NvmeConversionConvertTest" +``` + +Test categories: +- **NvmeConversionUnitTests** (16): OS detection, controller resolution, Windows version, VM generation +- **WindowsChecksUnitTests** (5): RunCommand result parsing, fix mode +- **LinuxChecksUnitTests** (5): RunCommand result parsing, fix/dry-run modes +- **ValidatorUnitTests** (6): vm_size format, sleep_seconds range +- **ResolveVmSizeTests** (8): Auto-size resolution, SKU capability handling +- **ConvertEndToEndTests** (11): Full mocked convert flow +- **CheckEndToEndTests** (6): Full mocked check flow + +### Live tests + +See `TESTING.md` for the full test tracker. Live tests were run against: +- Ubuntu 24.04, RHEL 9 LVM, SLES 15 SP6, Azure Linux 4, Windows Server 2022, Windows Server 2019 +- Both WSL and Windows Command Prompt CLIs +- SCSI→NVMe and NVMe→SCSI conversions +- Edge cases: already-on-target, dry-run, --no-wait, --ignore-os-check +- Batch conversion: 13 RHEL Gen2 VMs across 2 resource groups + +### azdev validation + +```bash +source .venv-azdev/bin/activate +azdev style nvme-conversion # Pylint PASSED, Flake8 PASSED +azdev test nvme-conversion # 56 passed, 2 skipped +azdev linter nvme-conversion # Requires ssh extension fix (unrelated) +``` + +## Versioning + +Following [Azure CLI extension versioning guidelines](https://github.com/Azure/azure-cli/blob/dev/doc/extensions/versioning_guidelines.md): + +- Current version: `1.0.0b1` (initial preview) +- Preview versions use `X.Y.ZbN` format +- `azext_metadata.json` sets `azext.isPreview: true` + +## Publishing + +For extensions hosted in `Azure/azure-cli-extensions`, publishing is automatic: +1. Merge code to `main` branch +2. The CI detects the version via `python setup.py --version` +3. If version is new, CI builds the wheel, uploads it, and updates `src/index.json` +4. 
A PR is auto-created for the index update +5. After merge, the extension is available via `az extension add -n nvme-conversion` + +Manual `index.json` updates are NOT required. + +## Files Reference + +| File | Purpose | Lines | +|---|---|---| +| `setup.py` | Package metadata, version, dependencies | ~55 | +| `setup.cfg` | Empty (convention) | 1 | +| `README.md` | User-facing documentation | ~45 | +| `HISTORY.rst` | Release history (reStructuredText, repo convention) | ~12 | +| `ROADMAP.md` | Development roadmap with checkboxes | ~300 | +| `TESTING.md` | Live test tracker with results | ~100 | +| `DEVELOPMENT.md` | This file | ~200 | +| `azext_metadata.json` | Extension metadata (preview, min CLI version) | 4 | +| `__init__.py` | CommandsLoader entry point | ~30 | +| `commands.py` | Command registration + formatters | ~15 | +| `_params.py` | Argument definitions | ~50 | +| `_help.py` | Help text with examples | ~80 | +| `_validators.py` | Parameter validators | ~30 | +| `_format.py` | Table output formatters | ~40 | +| `_client_factory.py` | SDK client factory | ~10 | +| `custom.py` | Core orchestration logic | ~560 | +| `_windows_checks.py` | Windows OS checks/fixes | ~95 | +| `_linux_checks.py` | Linux OS checks/fixes | ~80 | +| `_linux_script.py` | Embedded bash script (~500 lines) | ~520 | +| `tests/.../test_nvme_conversion.py` | 56 unit + integration tests | ~670 | diff --git a/src/nvme-conversion/HISTORY.rst b/src/nvme-conversion/HISTORY.rst new file mode 100755 index 00000000000..3e1b3d8f273 --- /dev/null +++ b/src/nvme-conversion/HISTORY.rst @@ -0,0 +1,14 @@ +.. 
:changelog: + +Release History +=============== + +1.0.0b1 +++++++ +* Initial preview release +* Convert VMs between SCSI and NVMe disk controllers +* Pre-flight validation checks (SKU, OS, generation, ADE) +* OS preparation for Windows (stornvme driver) and Linux (initrd, fstab, io_timeout) +* Dry-run mode for Linux VMs +* Fallback udev rules when azure-vm-utils is not installed +* Auto-detect controller type; reuse current VM size if compatible diff --git a/src/nvme-conversion/README.md b/src/nvme-conversion/README.md new file mode 100755 index 00000000000..be9fb76ae7b --- /dev/null +++ b/src/nvme-conversion/README.md @@ -0,0 +1,60 @@ +# Azure NVMe Conversion Extension + +Convert Azure Virtual Machines between SCSI and NVMe disk controllers. + +For architecture, design decisions, and development setup, see [DEVELOPMENT.md](DEVELOPMENT.md). + +## Why use this extension? + +The built-in `az vm update --disk-controller-type` only sets the controller property on the VM — one step out of eight required for a safe conversion. 
This extension orchestrates the full lifecycle: + +| Step | `az vm update` | `az nvme-conversion convert` | +|---|---|---| +| Pre-flight validation (Gen2, ADE, SKU) | Manual | Automatic | +| OS readiness check & fix (drivers, grub, fstab) | Manual RunCommand per distro | `--fix-os` | +| OS disk `supportedCapabilities` update | Separate `az disk update` | Automatic | +| VM deallocate | Separate `az vm deallocate` | Automatic (skips if already stopped) | +| VM resize + controller change | `az vm update` | Combined in one step | +| VM start | Separate `az vm start` | `--start-vm` | +| Rollback instructions | None | Provided in output | +| Dry-run mode | None | `--dry-run` | + +## Usage + +```bash +# Convert a VM (auto-detects controller type, keeps current size if it supports both) +az nvme-conversion convert --resource-group myRG --vm-name myVM --start-vm + +# Convert and change VM size +az nvme-conversion convert --resource-group myRG --vm-name myVM --vm-size Standard_E4bds_v5 --start-vm + +# Check VM readiness without making changes +az nvme-conversion check --resource-group myRG --vm-name myVM + +# Explicitly convert to SCSI with a different size +az nvme-conversion convert --resource-group myRG --vm-name myVM --controller-type SCSI --vm-size Standard_E4s_v5 --start-vm + +# Dry-run for Linux VMs (stage changes without applying) +az nvme-conversion convert --resource-group myRG --vm-name myVM --dry-run + +# Auto-fix OS settings during conversion +az nvme-conversion convert --resource-group myRG --vm-name myVM --fix-os --start-vm --yes +``` + +## Parameters + +| Parameter | Description | Required | +|---|---|---| +| `--resource-group -g` | Resource group of the VM | Yes | +| `--vm-name -n` | VM name | Yes | +| `--controller-type` | Target controller type (NVMe or SCSI). Auto-detected if omitted | No | +| `--vm-size` | Target VM size/SKU. 
If omitted, keeps current size when it supports the target controller | No | +| `--start-vm` | Start VM after conversion | No | +| `--fix-os` | Auto-fix OS settings for NVMe readiness | No | +| `--ignore-sku-check` | Skip SKU validation | No | +| `--ignore-os-check` | Skip OS readiness check | No | +| `--ignore-windows-version-check` | Skip Windows version check | No | +| `--dry-run` | Linux only: stage changes without applying | No | +| `--sleep-seconds` | Delay before starting VM (default: 15) | No | +| `--yes -y` | Skip confirmation prompts | No | +| `--no-wait` | Do not wait for VM start to complete | No | diff --git a/src/nvme-conversion/ROADMAP.md b/src/nvme-conversion/ROADMAP.md new file mode 100755 index 00000000000..124de1f9255 --- /dev/null +++ b/src/nvme-conversion/ROADMAP.md @@ -0,0 +1,389 @@ +# Azure NVMe Conversion Extension — Analysis & Roadmap + +## 1. Script Analysis + +### Source Script +`Azure-NVMe-Conversion.ps1` — 1,323 lines of PowerShell that orchestrates the conversion of Azure VMs between SCSI and NVMe disk controllers. + +### What the Script Does (Logical Flow) + +``` +┌─────────────────────────────────────────────────┐ +│ 1. VALIDATION PHASE │ +│ ├─ Check Azure context / authentication │ +│ ├─ Get VM, verify it exists │ +│ ├─ Check Azure Disk Encryption (block ADE) │ +│ ├─ Check VM power state (must be running) │ +│ ├─ Detect OS (Windows / Linux) │ +│ ├─ Check Windows version (>= 2019) │ +│ ├─ Check VM generation (must be Gen2) │ +│ └─ Check current controller (SCSI vs NVMe) │ +│ │ +│ 2. SKU VALIDATION PHASE │ +│ ├─ Get compute resource SKUs for region │ +│ ├─ Validate target SKU exists in zone │ +│ ├─ Check resource disk compatibility │ +│ └─ Verify target SKU supports NVMe/SCSI │ +│ │ +│ 3. 
OS PREPARATION PHASE │ +│ ├─ Windows: Check stornvme driver via │ +│ │ RunCommand, optionally fix │ +│ └─ Linux: Run embedded bash script via │ +│ RunCommand that: │ +│ ├─ Checks NVMe driver in initrd/initramfs │ +│ ├─ Checks nvme_core.io_timeout parameter │ +│ ├─ Checks /etc/fstab for deprecated devs │ +│ ├─ Optionally fixes all issues │ +│ └─ Supports dry-run staging │ +│ │ +│ 4. CONVERSION PHASE │ +│ ├─ Shutdown VM (Stop-AzVM) │ +│ ├─ Update OS disk capabilities via REST API │ +│ │ (PATCH disk supportedCapabilities) │ +│ ├─ Update VM size + DiskControllerType │ +│ └─ Optionally start VM │ +│ │ +│ 5. POST-CONVERSION │ +│ └─ Output revert instructions │ +└─────────────────────────────────────────────────┘ +``` + +### Key Azure SDK Operations Required + +| PS Script Operation | Python SDK Equivalent | +|---|---| +| `Get-AzContext` | `azure.cli.core` (handled by CLI framework) | +| `Get-AzVM` | `ComputeManagementClient.virtual_machines.get()` | +| `Get-AzVMExtension` | `ComputeManagementClient.virtual_machine_extensions.get()` | +| `Get-AzVM -Status` | `ComputeManagementClient.virtual_machines.instance_view()` | +| `Get-AzDisk` | `ComputeManagementClient.disks.get()` | +| `Get-AzComputeResourceSku` | `ComputeManagementClient.resource_skus.list()` | +| `Invoke-AzVMRunCommand` | `ComputeManagementClient.virtual_machines.begin_run_command()` | +| `Stop-AzVM` | `ComputeManagementClient.virtual_machines.begin_deallocate()` | +| `Start-AzVM` | `ComputeManagementClient.virtual_machines.begin_start()` | +| `Update-AzVM` | `ComputeManagementClient.virtual_machines.begin_create_or_update()` | +| `Invoke-RestMethod` (disk PATCH) | `ComputeManagementClient.disks.begin_update()` | + +--- + +## 2. 
Extension Design + +### Extension Name +`nvme-conversion` + +### Package Name +`azext_nvme_conversion` + +### Command Group & Commands + +``` +az nvme-conversion +├── convert # Full conversion (SCSI→NVMe or NVMe→SCSI) +├── check # Pre-flight validation only (no changes) +└── revert # Convenience: convert back to original controller +``` + +### Command: `az nvme-conversion convert` + +| Parameter | CLI Flag | Type | Required | Description | +|---|---|---|---|---| +| resource_group | `--resource-group -g` | str | Yes | Resource group of the VM | +| vm_name | `--vm-name -n` | str | Yes | VM name | +| new_controller_type | `--controller-type` | str (NVMe/SCSI) | No (default: NVMe) | Target controller type | +| vm_size | `--vm-size` | str | Yes | Target VM size/SKU | +| start_vm | `--start-vm` | bool | No | Start VM after conversion | +| fix_os | `--fix-os` | bool | No | Auto-fix OS settings | +| ignore_sku_check | `--ignore-sku-check` | bool | No | Skip SKU validation | +| ignore_os_check | `--ignore-os-check` | bool | No | Skip OS readiness check | +| ignore_windows_version | `--ignore-windows-version-check` | bool | No | Skip Windows version check | +| dry_run | `--dry-run` | bool | No | Linux only: stage changes without applying | +| sleep_seconds | `--sleep-seconds` | int | No (default: 15) | Delay before starting VM | +| yes | `--yes -y` | bool | No | Skip confirmation prompts | + +### Command: `az nvme-conversion check` + +Same parameters as `convert` minus `--start-vm`, `--fix-os`, `--sleep-seconds`. Runs all validation steps and OS checks without making changes. 
+ +### Command: `az nvme-conversion revert` + +| Parameter | CLI Flag | Type | Required | Description | +|---|---|---|---|---| +| resource_group | `--resource-group -g` | str | Yes | Resource group of the VM | +| vm_name | `--vm-name -n` | str | Yes | VM name | +| original_vm_size | `--original-vm-size` | str | Yes | Original VM size to revert to | +| start_vm | `--start-vm` | bool | No | Start VM after revert | + +--- + +## 3. Directory Structure + +``` +src/nvme-conversion/ +├── setup.py +├── setup.cfg +├── README.md +├── HISTORY.rst +└── azext_nvme_conversion/ + ├── __init__.py # CommandsLoader + ├── commands.py # Command registration + ├── _params.py # Argument definitions + ├── _help.py # Help text + ├── _validators.py # Parameter validators + ├── custom.py # Core conversion logic + ├── _client_factory.py # SDK client factory + ├── _vm_operations.py # VM operations (get, stop, start, update) + ├── _sku_operations.py # SKU validation logic + ├── _os_preparation.py # OS checks & fixes (Windows + Linux) + ├── _disk_operations.py # Disk update operations + ├── azext_metadata.json # Extension metadata + ├── scripts/ + │ └── linux_nvme_check.sh # Extracted Linux bash script + └── tests/ + ├── __init__.py + └── latest/ + ├── __init__.py + ├── test_nvme_conversion.py # Scenario tests + ├── test_validators.py # Unit tests for validators + ├── test_os_preparation.py # Unit tests for OS prep + └── recordings/ # VCR test recordings +``` + +--- + +## 4. Implementation Roadmap + +### Phase 1: Scaffold & Core Infrastructure +**Estimated complexity: Low** + +- [x] 1. **Create extension scaffold** + - [x] `setup.py`, `setup.cfg`, `README.md`, `HISTORY.rst` + - [x] `azext_metadata.json` + - [x] `__init__.py` with `CommandsLoader` + - [x] Empty `commands.py`, `_params.py`, `_help.py` + +- [x] 2. **Client factory** (`_client_factory.py`) + - [x] Create `ComputeManagementClient` from CLI context + - [x] Handle subscription context + +- [x] 3. 
**Register commands** (`commands.py`) + - [x] `nvme-conversion convert` + - [x] `nvme-conversion check` + - [x] `nvme-conversion revert` — *not needed: `convert --controller-type SCSI` handles this* + +- [x] 4. **Define parameters** (`_params.py`) + - [x] All arguments with types, validators, help text + - [x] Enum choices for `--controller-type` + +- [x] 5. **Define help text** (`_help.py`) + - [x] Command group help + - [x] Per-command help with examples + +### Phase 2: Validation Logic +**Estimated complexity: Medium** + +- [x] 6. **VM validation** (`custom.py`) + - [x] `_validate_vm()` — fetch VM, raise if not found + - [x] `_check_vm_power_state()` — check power state + - [x] `_check_ade_extension()` — block ADE for Linux + - [x] `_check_vm_generation()` — must be Gen2 + - [x] `_check_current_controller()` — detect SCSI/NVMe + - [x] `_detect_os_type()` — Windows vs Linux + - [x] `_check_windows_version()` — >= 2019 + +- [x] 7. **SKU validation** (`custom.py`) + - [x] `_validate_sku()` — target SKU exists + - [x] Zone availability check + - [x] Controller support check + - [x] Resource disk compatibility (Windows) + +- [x] 8. **Validators** (`_validators.py`) + - [x] `validate_vm_size` — must match Standard_* pattern + - [x] `validate_sleep_seconds` — non-negative, max 600 + +### Phase 3: OS Preparation +**Estimated complexity: High** + +- [x] 9. **OS preparation** (`custom.py`) + - [x] `_prepare_windows()` — run stornvme check/fix via RunCommand + - [x] `_prepare_linux()` — run bash script via RunCommand + - [x] `_check_os_readiness()` — check-only wrapper + +- [x] 10. **Linux bash script** (`_linux_script.py`) + - [x] Extract the embedded ~453-line bash script + - [x] Parameterize it (accept `-fix` and `-dryrun` flags) + - [x] Wrap in `get_linux_check_script()` function + +### Phase 4: Conversion Operations +**Estimated complexity: Medium** + +- [x] 11. 
**Disk operations** (`custom.py`) + - [x] `_update_disk_capabilities()` — update OS disk supportedCapabilities + - [x] Use `disks.begin_update()` instead of raw REST + +- [x] 12. **VM operations** (`custom.py`) + - [x] `_stop_vm()` — deallocate + - [x] `_start_vm()` — start with optional delay + - [x] `_update_vm()` — resize + set DiskControllerType + +### Phase 5: Command Orchestration +**Estimated complexity: Medium** + +- [x] 13. **`convert` command** (`custom.py`) + - [x] Orchestrate: validate → check OS → confirm → shutdown → update disk → update VM → start + - [x] Progress output via `logger.warning()` for user-visible messages + - [x] Handle `--yes` for non-interactive mode (via `confirmation=True`) + - [x] Handle `--dry-run` for Linux + +- [x] 14. **`check` command** (`custom.py`) + - [x] Run validation + OS checks only, output report + - [x] Return structured JSON with pass/fail per check + +- [x] 15. **`revert` command** — *not needed: `convert --controller-type SCSI` handles this* + +### Phase 6: Testing +**Estimated complexity: High** + +- [x] 16. **Unit tests** (`test_nvme_conversion.py`) + - [x] Test OS type detection (Windows, Linux, None) + - [x] Test controller state detection + - [x] Test Windows version validation + - [x] Test VM generation check (V1 blocked, V2 passes) + - [x] Test Linux script content validation + +- [x] 17. **Unit tests — OS preparation** + - [x] Mock RunCommand responses + - [x] Test Windows check parsing (pass, Start error, StartOverride error) + - [x] Test Linux script output parsing (pass, error, fix mode, dry-run mode) + +- [x] 18. 
**Scenario tests** (`test_nvme_conversion.py`) + - [x] Mocked end-to-end convert: SCSI→NVMe, NVMe→SCSI + - [x] Mocked end-to-end check: pass, gen1 fail, ADE fail, VM not found + - [x] Convert already-on-target returns no-change + - [x] Convert with start-vm / without start-vm / no-wait + - [x] Convert dry-run stops before shutdown + - [x] Convert includes revert command for NVMe target + - [x] VM size auto-resolution tests (4 tests) + +- [x] 19. **Live tests** + - [x] Ubuntu, RHEL, SLES, Azure Linux 4, Windows 2022, Windows 2019 + - [x] Full conversion round-trip (SCSI→NVMe→SCSI) + +### Phase 7: Polish & Release +**Estimated complexity: Low** + +- [x] 20. **Error handling** + - [x] User-friendly error messages + - [x] Actionable suggestions on failure + - [x] Revert instructions on conversion failure + - [x] `--no-wait` support for long operations + +- [x] 21. **Output formatting** + - [x] Table format for check results + - [x] JSON output for automation + - [x] Table format for convert results + +- [x] 22. **Documentation** + - [x] `README.md` with usage examples + - [x] `HISTORY.rst` with initial version + - [x] Help text with realistic examples + +- [x] 23. **CI integration** + - [x] Linting (`flake8`) — all files pass + - [x] Pylint — 9.89/10, all issues fixed + - [x] `azdev style nvme-conversion` — PASSED + - [x] All Python files compile without errors + - [x] `azdev linter` — PASSED (1 issue fixed: added --ignore-win-ver short alias) + - [ ] Register in `src/index.json` (auto on merge to main) + +--- + +## 5. Key Design Decisions + +### Why NOT use AAZ (Auto-generated commands)? +The script orchestrates **multiple Azure operations** (VM get → extension check → SKU check → RunCommand → VM stop → disk PATCH → VM update → VM start). AAZ is designed for single-resource CRUD operations mapping to a single REST endpoint. This extension requires custom orchestration logic. + +### SDK vs Raw REST +The original script uses `Invoke-RestMethod` for the OS disk PATCH. 
The Python SDK's `disks.begin_update()` can achieve the same result without raw REST calls, which is cleaner and benefits from SDK retry/error handling. + +### Embedded Bash Script +The ~500-line Linux bash script will be kept as a string constant in `_os_preparation.py` (or loaded from `scripts/linux_nvme_check.sh`). It's sent to the VM via RunCommand — it never executes locally. + +### Confirmation Prompts +The original script uses `Read-Host` for confirmation. The CLI extension will use `--yes/-y` for non-interactive mode and the Azure CLI's built-in confirmation via `user_confirmation()`. + +### Logging +PowerShell's `WriteRunLog` maps to Python's `logger`: +- `INFO` → `logger.info()` (only shown with `--verbose` or `--debug`) +- `WARNING` → `logger.warning()` (always shown) +- `ERROR` → `raise CLIError()` or `raise ValidationError()` +- `IMPORTANT` → `logger.warning()` with prominent formatting + +--- + +## 6. Dependencies + +```python +DEPENDENCIES = [ + 'azure-mgmt-compute>=33.0.0', +] +``` + +The `azure-mgmt-compute` package provides: +- `ComputeManagementClient` — VM, disk, extension, SKU operations +- `VirtualMachine`, `Disk`, `RunCommandInput` models + +--- + +## 7. Risk Assessment + +| Risk | Impact | Mitigation | +|---|---|---| +| Long-running operations (VM stop/start) | User confusion | Progress indicators, `--no-wait` support | +| RunCommand timeout on large VMs | Script appears stuck | Set explicit timeouts, stream output | +| SKU API returns stale data | Wrong validation result | Cache with TTL, warn user | +| Linux bash script fails on unknown distro | Conversion proceeds without OS prep | Fail-safe: block conversion if OS check fails | +| Disk PATCH fails | Disk in inconsistent state | Disk update is safe — no data loss, retry-able | +| VM won't start after conversion | VM stuck deallocated | Provide revert command with original size | + +--- + +## 8. 
Testing Strategy + +### Test Pyramid + +``` + ┌──────────────┐ + │ Live Tests │ ← Optional, CI-gated, real Azure resources + │ (2-3) │ + ┌┴──────────────┴┐ + │ Scenario Tests │ ← Recorded HTTP, full command flow + │ (8-10) │ + ┌┴────────────────┴┐ + │ Unit Tests │ ← Mocked, fast, high coverage + │ (20-30) │ + └──────────────────┘ +``` + +### Unit Tests +- Validator functions with various inputs +- SKU capability parsing +- OS type detection +- Windows/Linux check output parsing +- Error condition handling + +### Scenario Tests (with VCR recordings) +- Happy path: SCSI → NVMe conversion +- Happy path: NVMe → SCSI conversion +- Check command output format +- Revert command +- Error: VM not found +- Error: Gen1 VM blocked +- Error: ADE detected +- Error: SKU doesn't support NVMe +- Error: Windows version too old +- Dry-run mode (Linux) + +### Live Tests +- Full round-trip conversion on a test VM +- Verify VM boots correctly after conversion +- Verify disk controller type changed diff --git a/src/nvme-conversion/TESTING.md b/src/nvme-conversion/TESTING.md new file mode 100755 index 00000000000..fec064a99ac --- /dev/null +++ b/src/nvme-conversion/TESTING.md @@ -0,0 +1,99 @@ +# NVMe Conversion Extension — Live Test Tracker + +**Date**: 2026-05-10 +**Resource Group**: `nvme-conversion-test` +**Location**: `westus3` +**Extension version**: `1.0.0b1` +**az CLI versions**: WSL 2.86.0 / Windows 2.85.0 + +## Test VMs + +| VM Name | OS | Image | Size | Initial Controller | Status | +|---|---|---|---|---|---| +| nvme-test-ubuntu | Linux | Canonical:ubuntu-24_04-lts:server:latest | Standard_D2s_v5 | SCSI | Provisioned | +| nvme-test-rhel | Linux | RedHat:RHEL:9-lvm-gen2:latest | Standard_D2s_v5 | SCSI | Provisioned | +| nvme-test-sles | Linux | SUSE:sles-15-sp6:gen2:latest | Standard_D2s_v5 | SCSI | Provisioned | +| nvme-test-azl4 | Linux | microsoftazurelinux:azurelinux-4:4:latest | Standard_D2s_v5 | SCSI | Provisioned | +| nvme-test-win | Windows | 
MicrosoftWindowsServer:WindowsServer:2022-datacenter-g2:latest | Standard_D2s_v5 | SCSI | Provisioned | + +## Test Plan + +### Phase A — `check` command (read-only, no changes) + +| # | Test | VM | CLI | Expected | Result | Notes | +|---|---|---|---|---|---|---| +| A1 | Check Ubuntu (SCSI→NVMe, no --vm-size) | nvme-test-ubuntu | WSL | passed (SKU warns) | **PASS** | After fix: warns about missing DiskControllerTypes but passes | +| A1-pre | Check Ubuntu (before SKU fix) | nvme-test-ubuntu | WSL | skuValidation failed | **PASS** | Correctly detected D2s_v5 lacks NVMe before the fix | +| A2 | Check RHEL (--ignore-sku-check) | nvme-test-rhel | WSL | passed | **PASS** | All checks pass including osReadiness via RunCommand | +| A3 | Check SLES (--ignore-sku-check, no OS) | nvme-test-sles | WSL | passed | **PASS** | Validation checks pass (ADE, Gen2, controller) | +| A4 | Check AzL4 (--ignore-sku-check, no OS) | nvme-test-azl4 | WSL | passed | **PASS** | Validation checks pass | +| A5 | Check Win2022 from Windows CLI | nvme-test-win | **Win CMD** | osReadiness failed | **PASS** | StartOverride:ERROR detected correctly. windowsVersion: passed | +| A6 | Check Ubuntu full (all checks) | nvme-test-ubuntu | WSL | passed | **PASS** | All 7 checks pass including osReadiness and skuValidation | +| A7 | Check already-on-target (--controller-type SCSI) | nvme-test-ubuntu | WSL | info/no-change | **PASS** | Returns `controllerCheck: info` | + +> **Finding A1-pre (critical)**: No v5 D-series SKU has `DiskControllerTypes` in the SKU API. +> Only v6+ and Ebds/Ebs v5 series advertise it. When absent, it means SCSI-only. +> +> **Fix applied**: SKU validation now treats missing `DiskControllerTypes` as "unknown" — +> warns but doesn't block. The VM update API will fail safely if the SKU truly doesn't support it. +> +> **Finding A5**: Windows Server 2022 has a `StartOverride` registry key that blocks NVMe. +> This is expected and fixable with `--fix-os`. The check correctly identifies this. 
+ +### Phase B — `convert` command (SCSI → NVMe) + +| # | Test | VM | CLI | Expected | Result | Notes | +|---|---|---|---|---|---|---| +| B1 | Convert Ubuntu SCSI→NVMe (--vm-size Standard_D2s_v6 --fix-os --start-vm) | nvme-test-ubuntu | WSL | succeeded, NVMe | **PASS** | All 8 steps completed, VM running on NVMe | +| B2 | Convert RHEL SCSI→NVMe (--vm-size Standard_D2s_v6 --fix-os --start-vm) | nvme-test-rhel | WSL | succeeded, NVMe | **PASS** | | +| B3 | Convert SLES SCSI→NVMe (--vm-size Standard_D2s_v6 --fix-os --start-vm) | nvme-test-sles | WSL | succeeded, NVMe | **PASS** | | +| B4 | Convert AzL4 SCSI→NVMe (--vm-size Standard_D2s_v6 --fix-os --start-vm) | nvme-test-azl4 | WSL | succeeded, NVMe | **PASS** | | +| B5 | Convert Windows SCSI→NVMe (--vm-size Standard_D2s_v6 --fix-os --start-vm) | nvme-test-win | WSL | succeeded, NVMe | **PASS** | | + +### Phase C — Post-conversion validation + +| # | Test | VM | CLI | Expected | Result | Notes | +|---|---|---|---|---|---|---| +| C1 | Verify Ubuntu is NVMe after boot | nvme-test-ubuntu | WSL | controller=NVMe | **PASS** | az vm show confirms NVMe | +| C2 | Verify RHEL is NVMe after boot | nvme-test-rhel | WSL | controller=NVMe | **PASS** | | +| C3 | Verify SLES is NVMe after boot | nvme-test-sles | WSL | controller=NVMe | **PASS** | | +| C4 | Verify AzL4 is NVMe after boot | nvme-test-azl4 | WSL | controller=NVMe | **PASS** | | +| C5 | Verify Windows is NVMe after boot | nvme-test-win | WSL | controller=NVMe | **PASS** | | + +### Phase D — `convert` command (NVMe → SCSI revert) + +| # | Test | VM | CLI | Expected | Result | Notes | +|---|---|---|---|---|---|---| +| D1 | Convert Ubuntu NVMe→SCSI (--start-vm) | nvme-test-ubuntu | WSL | succeeded, SCSI | **PASS** | Reverted to D2s_v5 SCSI, VM running | +| D2 | Convert Windows NVMe→SCSI (--start-vm) | nvme-test-win | WSL | succeeded, SCSI | **PASS** | Reverted to D2s_v5 SCSI, VM running | + +### Phase E — Edge cases & error paths + +| # | Test | VM | CLI | Expected | 
Result | Notes | +|---|---|---|---|---|---|---| +| E1 | Convert already-NVMe with --controller-type NVMe | nvme-test-rhel | WSL | no-change | **PASS** | Returns status:no-change cleanly | +| E2 | Check with --ignore-os-check | nvme-test-ubuntu | WSL | passed (skips OS) | **PASS** | No powerState/osReadiness checks in output | +| E3 | Convert with --dry-run (Linux) | nvme-test-sles | WSL | dry-run-complete | **PASS** | VM unchanged, returns immediately | +| E4 | Convert with --no-wait --start-vm | nvme-test-azl4 | WSL | succeeded, no wait | **PASS** | Conversion completed, VM reverted to SCSI | +| E5 | Check from Windows CLI | nvme-test-win | **Win CMD** | osReadiness failed | **PASS** | Extension works correctly from Windows CMD | + +### Phase F — Windows version coverage + +| # | Test | VM | CLI | Expected | Result | Notes | +|---|---|---|---|---|---|---| +| F1 | Check Win2022 (from WSL) | nvme-test-win | WSL | osReadiness: StartOverride:ERROR | **PASS** | Most popular server version | +| F2 | Check Win2022 (from Win CMD) | nvme-test-win | **Win CMD** | osReadiness: StartOverride:ERROR | **PASS** | Same result from both CLIs | +| F3 | Check Win2019 (from WSL) | nvme-test-w2019 | WSL | windowsVersion: passed, osReadiness: failed | **PASS** | Min NVMe version, StartOverride issue detected | +| F4 | Check Win2019 (from Win CMD) | nvme-test-w2019 | **Win CMD** | windowsVersion: passed, osReadiness: failed | **PASS** | Identical results from both CLIs | + +> **Most used Windows Server versions on Azure** (by market share): +> 1. Windows Server 2022 — current mainstream (tested: Win2022 ✓) +> 2. Windows Server 2019 — still widely deployed, minimum for NVMe (tested: Win2019 ✓) +> 3. Windows Server 2025 — newest, same NVMe behavior as 2022 (not tested: same code path) +> 4. 
class NvmeConversionCommandsLoader(AzCommandsLoader):
    """Commands loader that plugs the nvme-conversion commands into the CLI.

    Custom commands are resolved from ``azext_nvme_conversion.custom``.
    """

    def __init__(self, cli_ctx=None):
        from azure.cli.core.commands import CliCommandType

        # Every custom command name is looked up inside custom.py.
        super().__init__(
            cli_ctx=cli_ctx,
            custom_command_type=CliCommandType(
                operations_tmpl='azext_nvme_conversion.custom#{}'),
        )

    def load_command_table(self, args):
        """Register the extension's commands and return the command table."""
        from azext_nvme_conversion import commands as _commands

        _commands.load_command_table(self, args)
        return self.command_table

    def load_arguments(self, command):
        """Register the argument definitions for ``command``."""
        from azext_nvme_conversion import _params

        _params.load_arguments(self, command)


COMMAND_LOADER_CLS = NvmeConversionCommandsLoader
def cf_compute(cli_ctx, **_):
    """Return a ComputeManagementClient built from the given CLI context.

    Any extra keyword arguments are accepted and ignored.
    """
    from azure.cli.core.commands.client_factory import get_mgmt_service_client
    from azure.mgmt.compute import ComputeManagementClient

    compute_client = get_mgmt_service_client(cli_ctx, ComputeManagementClient)
    return compute_client
def check_result_table_format(result):
    """Build the ``-o table`` rows for the check command: one row per check.

    A falsy result yields an empty list. A check without a ``status`` is shown
    as ``'unknown'`` and a check without a ``message`` gets an empty string.
    """
    if not result:
        return []

    return [
        {
            'Check': name,
            'Status': outcome.get('status', 'unknown'),
            'Message': outcome.get('message', ''),
        }
        for name, outcome in result.get('checks', {}).items()
    ]


def convert_result_table_format(result):
    """Build the single ``-o table`` row for the convert command.

    A falsy result yields an empty list. Missing fields become empty strings;
    ``vmStarted`` is always rendered through ``str()``.
    """
    if not result:
        return []

    # (table header, key in the result dict), in display order.
    plain_columns = (
        ('VM', 'vm'),
        ('ResourceGroup', 'resourceGroup'),
        ('Status', 'status'),
        ('PreviousSize', 'previousSize'),
        ('NewSize', 'newSize'),
        ('ControllerType', 'controllerType'),
    )
    row = {header: result.get(key, '') for header, key in plain_columns}
    row['VMStarted'] = str(result.get('vmStarted', ''))
    return [row]
Unlike 'az vm update --disk-controller-type' which only sets the controller + property, this extension handles the full conversion lifecycle: pre-flight validation + (Gen2, ADE, SKU capabilities), OS readiness checks and fixes (stornvme on Windows, + initramfs/grub/fstab on Linux), OS disk capability update, VM deallocate, resize, + controller change, and optional restart — all in a single command. + For more information, see https://learn.microsoft.com/azure/virtual-machines/enable-nvme-interface +""" + +helps['nvme-conversion convert'] = """ +type: command +short-summary: Convert a VM's disk controller between SCSI and NVMe. +long-summary: | + Performs the full conversion flow in 8 steps: + [1] Validate VM exists and is Gen2 + [2] Resolve target controller type and VM size + [3] Check prerequisites (ADE, generation, power state) + [4] Validate target SKU capabilities + [5] Check/fix OS readiness via RunCommand + [6] Shut down (deallocate) the VM + [7] Update OS disk capabilities and VM size + [8] Optionally start the VM + + If --controller-type is not specified, the command auto-detects the current type + and toggles to the opposite. If --vm-size is not specified and the current size + supports both SCSI and NVMe, the current size is kept. 
+examples: + - name: Convert a VM to NVMe (auto-detect, keep current size) + text: > + az nvme-conversion convert + --resource-group MyResourceGroup + --vm-name MyVM + --start-vm + --yes + --verbose + - name: Convert a VM to NVMe with a specific target size + text: > + az nvme-conversion convert + --resource-group MyResourceGroup + --vm-name MyVM + --vm-size Standard_E4bds_v5 + --start-vm + --yes + --verbose + - name: Convert a VM back to SCSI + text: > + az nvme-conversion convert + --resource-group MyResourceGroup + --vm-name MyVM + --controller-type SCSI + --vm-size Standard_E4s_v5 + --start-vm + --yes + --verbose + - name: Convert with automatic OS fixes (stornvme driver on Windows, initramfs/grub/fstab on Linux) + text: > + az nvme-conversion convert + --resource-group MyResourceGroup + --vm-name MyVM + --fix-os + --start-vm + --yes + --verbose + - name: Dry-run on a Linux VM to assess readiness without making any changes + text: > + az nvme-conversion convert + --resource-group MyResourceGroup + --vm-name MyLinuxVM + --dry-run + --verbose + - name: Convert without waiting for the VM to fully start + text: > + az nvme-conversion convert + --resource-group MyResourceGroup + --vm-name MyVM + --start-vm + --no-wait + --yes + --verbose + - name: Convert and skip OS readiness check (use when OS is already prepared) + text: > + az nvme-conversion convert + --resource-group MyResourceGroup + --vm-name MyVM + --ignore-os-check + --start-vm + --yes + --verbose +""" + +helps['nvme-conversion check'] = """ +type: command +short-summary: Check VM readiness for disk controller conversion without making changes. +long-summary: | + Runs all pre-flight validation checks and reports the results as a JSON object + with pass/fail status per check. 
Checks include: + - VM exists and is Generation 2 + - No Azure Disk Encryption for Linux + - Current disk controller type + - Target SKU supports the desired controller + - OS readiness (stornvme driver on Windows, NVMe driver/grub/fstab on Linux) + + If --controller-type is not specified, checks readiness for toggling to the opposite type. + Use this command to validate VMs before conversion, especially in bulk scenarios. +examples: + - name: Check if a VM is ready for conversion + text: > + az nvme-conversion check + --resource-group MyResourceGroup + --vm-name MyVM + --verbose + - name: Check readiness targeting a specific VM size + text: > + az nvme-conversion check + --resource-group MyResourceGroup + --vm-name MyVM + --vm-size Standard_E4bds_v5 + --verbose + - name: Check readiness skipping OS checks (faster, no RunCommand) + text: > + az nvme-conversion check + --resource-group MyResourceGroup + --vm-name MyVM + --ignore-os-check + --verbose + - name: Check if a VM can be converted to SCSI + text: > + az nvme-conversion check + --resource-group MyResourceGroup + --vm-name MyVM + --controller-type SCSI + --verbose +""" diff --git a/src/nvme-conversion/azext_nvme_conversion/_linux_checks.py b/src/nvme-conversion/azext_nvme_conversion/_linux_checks.py new file mode 100755 index 00000000000..425ae15d51e --- /dev/null +++ b/src/nvme-conversion/azext_nvme_conversion/_linux_checks.py @@ -0,0 +1,80 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for license information. +# -------------------------------------------------------------------------------------------- + +"""Linux-specific OS checks and fixes for NVMe conversion. + +Steps performed (via embedded bash script sent through RunCommand): + 1. check_azure_vm_utils — Warn if azure-vm-utils is not installed + 2. 
def prepare_linux(compute_client, resource_group_name, vm_name, fix_os, dry_run):
    """Check/fix Linux NVMe readiness via RunCommand with embedded bash script.

    The bash script is sent to the VM and executed remotely. It checks:
      - azure-vm-utils presence (recommended for NVMe symlinks and io_timeout)
      - NVMe driver availability in initrd/initramfs or as kernel built-in
      - nvme_core.io_timeout=240 grub kernel parameter
      - /etc/fstab for deprecated device paths that break on NVMe

    Every non-empty output line is logged; lines containing ``[ERROR]`` are
    collected as blocking issues.

    Args:
        compute_client: Compute client whose
            ``virtual_machines.begin_run_command`` is used to run the script.
        resource_group_name: Resource group of the target VM.
        vm_name: Name of the target VM.
        fix_os: If True, the script will attempt to fix issues (rebuild initramfs,
            update grub, replace fstab entries with UUIDs).
        dry_run: If True, stage proposed changes in /tmp/nvme-conversion-dryrun/
            without modifying the system.

    Raises:
        ValidationError: If the script reported ``[ERROR]`` lines and this is
            not a dry run. In check-only mode the message points at --fix-os /
            --dry-run; with ``fix_os`` it reports the issues that remain after
            the automatic fix attempt.
    """
    from azext_nvme_conversion._linux_script import get_linux_check_script
    from azure.mgmt.compute.models import RunCommandInput

    args = []
    if fix_os:
        args.append('-fix')
    if dry_run:
        args.append('-dryrun')

    script_text = get_linux_check_script()
    # RunCommand parameters become positional args ($1, $2, ...) for the script
    params = [{'name': f'arg{i}', 'value': a} for i, a in enumerate(args)] if args else None
    run_input = RunCommandInput(
        command_id='RunShellScript',
        script=[script_text],
        parameters=params
    )

    logger.warning('Running Linux NVMe readiness check%s...', ' (dry-run)' if dry_run else '')
    # NOTE(review): confirm how the poller's result() behaves if the 600 s
    # timeout elapses before the remote script has finished.
    result = compute_client.virtual_machines.begin_run_command(
        resource_group_name, vm_name, run_input).result(timeout=600)

    errors = []
    for output in (result.value or []):
        message = output.message or ''
        for line in message.split('\n'):
            line = line.strip()
            if not line:
                continue
            if '[ERROR]' in line:
                errors.append(line)
            logger.warning(' OS check: %s', line)

    # A dry run only stages proposed changes, so it never blocks.
    if not errors or dry_run:
        return

    issues = '\n'.join(f' - {e}' for e in errors)
    if fix_os:
        # In fix mode the script prints [ERROR] only when a fix failed or is not
        # supported. Continuing would switch the controller on a VM that is
        # still not prepared for NVMe, so stop here instead of ignoring it.
        raise ValidationError(
            'Linux OS is still not ready for NVMe after applying --fix-os. Remaining issues:\n' +
            issues +
            '\nResolve these issues on the VM manually before converting.')

    raise ValidationError(
        'Linux OS is not ready for NVMe. Issues found:\n' +
        issues +
        '\nUse --fix-os to automatically fix or --dry-run to stage changes.')
def get_linux_check_script():
    """Return the bash source of the Linux NVMe readiness check.

    The returned text is what gets sent to the VM through RunCommand. The
    script covers:
      - presence of azure-vm-utils (recommended for NVMe symlinks and io_timeout)
      - the NVMe driver in initrd/initramfs (or built into the kernel)
      - the nvme_core.io_timeout grub parameter
      - deprecated device names in /etc/fstab (/dev/sd*, /dev/disk/azure/scsi*)

    Supports: Ubuntu, Debian, RHEL, CentOS, Rocky, AlmaLinux, SLES, OL, Azure Linux, Mariner
    Flags: -fix (apply fixes), -dryrun (stage changes in /tmp/nvme-conversion-dryrun/)
    """
    script_source = _LINUX_SCRIPT
    return script_source
+ + if command -v azure-nvme-id &>/dev/null; then + echo "[INFO] azure-vm-utils is installed (azure-nvme-id found)." + if [ -f /etc/udev/rules.d/80-azure-disk.rules ] || [ -f /lib/udev/rules.d/80-azure-disk.rules ] || [ -f /usr/lib/udev/rules.d/80-azure-disk.rules ]; then + echo "[INFO] 80-azure-disk.rules is present. NVMe disk symlinks and io_timeout will be managed by udev." + else + echo "[WARNING] azure-nvme-id found but 80-azure-disk.rules is missing. Udev symlinks may not work after conversion." + fi + else + echo "[WARNING] azure-vm-utils is not installed." + echo "[WARNING] After conversion, /dev/disk/azure/ symlinks for NVMe disks may not be available." + echo "[WARNING] Install azure-vm-utils from https://github.com/Azure/azure-vm-utils for best NVMe experience." + if $fix; then + _install_fallback_udev_rules + else + echo "[WARNING] Use --fix-os to install a fallback udev rule for basic NVMe symlinks and io_timeout." + fi + fi +} + +# Fallback udev rule for NVMe disks when azure-vm-utils is not available. +# Provides: io_timeout=240s, /dev/disk/azure/root, /dev/disk/azure/data/by-lun/N +# Does NOT provide: by-name, by-serial, by-index (those require azure-nvme-id binary) +# Named 99- so that 80-azure-disk.rules takes precedence if azure-vm-utils is later installed. 
+_FALLBACK_UDEV_RULE='# Fallback Azure NVMe udev rules (installed by az nvme-conversion) +# Remove this file after installing azure-vm-utils (which provides 80-azure-disk.rules) +ACTION!="add|change", GOTO="azure_nvme_fallback_end" +SUBSYSTEM!="block", GOTO="azure_nvme_fallback_end" +KERNEL!="nvme*", GOTO="azure_nvme_fallback_end" +ENV{ID_MODEL}!="MSFT NVMe Accelerator v1.0", GOTO="azure_nvme_fallback_end" + +# Set io_timeout to 240 seconds for remote NVMe disks +ENV{DEVTYPE}=="disk", ATTRS{nsid}=="?*", ATTR{queue/io_timeout}="240000" + +# OS disk: namespace ID 1 +KERNEL=="nvme*[0-9]n1", ENV{DEVTYPE}=="disk", SYMLINK+="disk/azure/root" +KERNEL=="nvme*[0-9]n1p[0-9]*", ENV{DEVTYPE}=="partition", SYMLINK+="disk/azure/root-part%n" + +# Data disks: namespace ID 2+ maps to LUN = nsid - 2 +KERNEL=="nvme*[0-9]n*[0-9]", ENV{DEVTYPE}=="disk", ATTRS{nsid}!="1", PROGRAM="/bin/sh -ec '\''echo $(($(cat /sys/class/block/%k/nsid) - 2))'\''", SYMLINK+="disk/azure/data/by-lun/%c" +KERNEL=="nvme*[0-9]n*[0-9]p[0-9]*", ENV{DEVTYPE}=="partition", ATTRS{nsid}!="1", PROGRAM="/bin/sh -ec '\''echo $(($(cat /sys/class/block/$(echo %k | sed s/p[0-9]*$//)/nsid) - 2))'\''", SYMLINK+="disk/azure/data/by-lun/%c-part%n" + +LABEL="azure_nvme_fallback_end"' + +_install_fallback_udev_rules() { + local _target="/etc/udev/rules.d/99-azure-nvme-fallback.rules" + + if [ -f "$_target" ]; then + echo "[INFO] Fallback udev rule already installed at $_target" + return + fi + + if $dry_run; then + echo "[DRYRUN] Would install fallback udev rule at $_target" + echo "$_FALLBACK_UDEV_RULE" > "$staging_dir/modified/99-azure-nvme-fallback.rules" + return + fi + + echo "[INFO] Installing fallback NVMe udev rule at $_target" + echo "$_FALLBACK_UDEV_RULE" > "$_target" + udevadm control --reload-rules 2>/dev/null || true + echo "[INFO] Fallback udev rule installed. 
Provides:" + echo "[INFO] - io_timeout=240s for NVMe remote disks" + echo "[INFO] - /dev/disk/azure/root symlink" + echo "[INFO] - /dev/disk/azure/data/by-lun/N symlinks" + echo "[INFO] Note: by-name/by-serial/by-index require azure-vm-utils" +} + +# Function to check if NVMe driver is in initrd/initramfs or built into the kernel +check_nvme_driver() { + echo "[INFO] Checking if NVMe driver is available for boot..." + + # Check if nvme is compiled directly into the kernel (built-in) + if grep -qw nvme "/lib/modules/$(uname -r)/modules.builtin" 2>/dev/null; then + echo "[INFO] NVMe driver is built into the kernel. No initramfs entry needed." + if $dry_run && $fix; then + echo "[DRYRUN] NVMe driver is built-in (kernel $(uname -r)). No initramfs or dracut changes needed." + echo "nvme_builtin=true" > "$staging_dir/modified/nvme-driver-status.txt" + echo "kernel=$(uname -r)" >> "$staging_dir/modified/nvme-driver-status.txt" + grep -w nvme "/lib/modules/$(uname -r)/modules.builtin" >> "$staging_dir/modified/nvme-driver-status.txt" + fi + return 0 + fi + + echo "[INFO] NVMe is not built-in. Checking initrd/initramfs..." + case "$distro" in + ubuntu|debian) + _initramfs_ok=true + if ! lsinitramfs /boot/initrd.img-* 2>/dev/null | grep -q nvme; then + echo "[WARNING] NVMe driver not found in initrd/initramfs." + _initramfs_ok=false + fi + if ! lsinitramfs /boot/initrd.img-* 2>/dev/null | grep -qE 'hv_pci|pci.hyperv'; then + echo "[WARNING] pci-hyperv/hv_pci driver not found in initrd/initramfs (required for Azure NVMe)." + _initramfs_ok=false + fi + if $_initramfs_ok; then + echo "[INFO] NVMe driver found in initrd/initramfs." + if $dry_run && $fix; then + echo "[DRYRUN] NVMe and pci-hyperv drivers already in initramfs. No changes needed." 
+ echo "nvme_in_initramfs=true" > "$staging_dir/modified/nvme-driver-status.txt" + echo "kernel=$(uname -r)" >> "$staging_dir/modified/nvme-driver-status.txt" + fi + else + if modinfo nvme &>/dev/null; then + echo "[INFO] NVMe module exists on disk." + fi + if $fix; then + if $dry_run; then + echo "[DRYRUN] Would run: update-initramfs -u -k all" + echo "update-initramfs -u -k all" > "$staging_dir/modified/initramfs-commands.txt" + else + echo "[INFO] Adding NVMe/pci-hyperv drivers to initrd/initramfs..." + update-initramfs -u -k all + if lsinitramfs /boot/initrd.img-* | grep -q nvme; then + echo "[INFO] NVMe driver added successfully." + else + echo "[ERROR] Failed to add NVMe driver to initrd/initramfs." + fi + fi + else + echo "[ERROR] NVMe driver not found in initrd/initramfs." + fi + fi + ;; + redhat|rhel|centos|rocky|almalinux|azurelinux|mariner|suse|sles|ol) + # Check ALL installed initramfs images, not just the running kernel + _nvme_missing=false + for _img in /boot/initramfs-*.img; do + [ -f "$_img" ] || continue + [[ "$_img" == *kdump* ]] && continue + [[ "$_img" == *rescue* ]] && continue + if ! lsinitrd "$_img" 2>/dev/null | grep -q nvme; then + echo "[WARNING] NVMe driver not found in $_img" + _nvme_missing=true + fi + if ! lsinitrd "$_img" 2>/dev/null | grep -qE 'hv_pci|pci.hyperv'; then + echo "[WARNING] pci-hyperv/hv_pci driver not found in $_img (required for Azure NVMe)" + _nvme_missing=true + fi + done + if ! $_nvme_missing; then + echo "[INFO] NVMe and pci-hyperv drivers found in initrd/initramfs." + if $dry_run && $fix; then + echo "[DRYRUN] NVMe and pci-hyperv drivers already in all initramfs images. No changes needed." + echo "nvme_in_initramfs=true" > "$staging_dir/modified/nvme-driver-status.txt" + echo "kernel=$(uname -r)" >> "$staging_dir/modified/nvme-driver-status.txt" + fi + else + if modinfo nvme &>/dev/null; then + echo "[INFO] NVMe module exists on disk but is not in all initramfs images." 
+ fi + if $fix; then + if $dry_run; then + echo "[DRYRUN] Would run: dracut -f --regenerate-all (with nvme nvme-core in /etc/dracut.conf.d/nvme.conf)" + echo 'add_drivers+=" nvme nvme-core pci-hyperv "' > "$staging_dir/modified/dracut-nvme.conf" + echo "dracut -f --regenerate-all" >> "$staging_dir/modified/initramfs-commands.txt" + else + echo "[INFO] Adding NVMe driver to initrd/initramfs (all kernels)..." + mkdir -p /etc/dracut.conf.d + echo 'add_drivers+=" nvme nvme-core pci-hyperv "' | tee /etc/dracut.conf.d/nvme.conf > /dev/null + dracut -f --regenerate-all + _verify_ok=true + for _img in /boot/initramfs-*.img; do + [ -f "$_img" ] || continue + [[ "$_img" == *kdump* ]] && continue + [[ "$_img" == *rescue* ]] && continue + if ! lsinitrd "$_img" 2>/dev/null | grep -q nvme; then + echo "[ERROR] NVMe driver still missing in $_img after rebuild." + _verify_ok=false + fi + if ! lsinitrd "$_img" 2>/dev/null | grep -qE 'hv_pci|pci.hyperv'; then + echo "[ERROR] pci-hyperv/hv_pci driver still missing in $_img after rebuild." + _verify_ok=false + fi + done + if $_verify_ok; then + echo "[INFO] NVMe driver added successfully." + else + echo "[ERROR] Failed to add NVMe driver to all initramfs images." + fi + fi + else + echo "[ERROR] NVMe driver not found in initrd/initramfs." + fi + fi + ;; + *) + echo "[ERROR] Unsupported distribution for NVMe driver check." + return 1 + ;; + esac +} + +# Function to check nvme_core.io_timeout parameter +check_nvme_timeout() { + echo "[INFO] Checking nvme_core.io_timeout parameter..." + + # Build grub file list dynamically for verification + _grub_check_files="/etc/default/grub" + if [ -f /boot/grub2/grub.cfg ]; then + _grub_check_files="$_grub_check_files /boot/grub2/grub.cfg" + elif [ -f /boot/grub/grub.cfg ]; then + _grub_check_files="$_grub_check_files /boot/grub/grub.cfg" + fi + + if grep -q "nvme_core.io_timeout=240" $_grub_check_files 2>/dev/null; then + echo "[INFO] nvme_core.io_timeout is set to 240." 
+ if $dry_run && $fix; then + echo "[DRYRUN] nvme_core.io_timeout already set to 240. No grub changes needed." + echo "nvme_core_io_timeout=240" > "$staging_dir/modified/nvme-timeout-status.txt" + echo "status=already_configured" >> "$staging_dir/modified/nvme-timeout-status.txt" + fi + elif command -v grubby &>/dev/null && grubby --info=ALL 2>/dev/null | grep -q "nvme_core.io_timeout=240"; then + echo "[INFO] nvme_core.io_timeout is set to 240 (BLS entries)." + else + echo "[WARNING] nvme_core.io_timeout is not set to 240." + if $fix; then + if $dry_run; then + echo "[DRYRUN] Staging grub changes..." + # Find the grub config file to stage + local grub_file="" + if [ -f /etc/default/grub ]; then + grub_file="/etc/default/grub" + fi + if [ -n "$grub_file" ]; then + cp "$grub_file" "$staging_dir/original/grub" + cp "$grub_file" "$staging_dir/modified/grub" + case "$distro" in + ubuntu) + sed -i 's/GRUB_CMDLINE_LINUX_DEFAULT="/GRUB_CMDLINE_LINUX_DEFAULT="nvme_core.io_timeout=240 /g' "$staging_dir/modified/grub" + ;; + debian) + sed -i 's/GRUB_CMDLINE_LINUX="/GRUB_CMDLINE_LINUX="nvme_core.io_timeout=240 /g' "$staging_dir/modified/grub" + ;; + suse|sles|opensuse*) + sed -i 's/GRUB_CMDLINE_LINUX_DEFAULT="/GRUB_CMDLINE_LINUX_DEFAULT="nvme_core.io_timeout=240 /g' "$staging_dir/modified/grub" + ;; + ol|azurelinux|mariner) + sed -i 's/GRUB_CMDLINE_LINUX_DEFAULT="/GRUB_CMDLINE_LINUX_DEFAULT="nvme_core.io_timeout=240 /g' "$staging_dir/modified/grub" + ;; + *) + sed -i 's/GRUB_CMDLINE_LINUX="/GRUB_CMDLINE_LINUX="nvme_core.io_timeout=240 /g' "$staging_dir/modified/grub" + ;; + esac + diff -u "$staging_dir/original/grub" "$staging_dir/modified/grub" > "$staging_dir/diffs/grub.diff" 2>&1 || true + echo "[DRYRUN] Grub diff staged in $staging_dir/diffs/grub.diff" + cat "$staging_dir/diffs/grub.diff" + # Check for BLS (BootLoaderSpec) — RHEL 8+, AlmaLinux 8+, OL 8.10+ + if grep -q "GRUB_ENABLE_BLSCFG=true" "$grub_file" 2>/dev/null; then + echo "[INFO] BLS (BootLoaderSpec) is 
enabled." + if command -v grubby &>/dev/null; then + echo "[DRYRUN] Would run: grubby --update-kernel=ALL --args=nvme_core.io_timeout=240" + echo "bls_enabled=true" >> "$staging_dir/modified/nvme-timeout-status.txt" + echo "grubby_available=true" >> "$staging_dir/modified/nvme-timeout-status.txt" + else + echo "[WARNING] BLS is enabled but grubby is not installed." + echo "bls_enabled=true" >> "$staging_dir/modified/nvme-timeout-status.txt" + echo "grubby_available=false" >> "$staging_dir/modified/nvme-timeout-status.txt" + fi + fi + else + echo "[DRYRUN] No grub config found to stage." + fi + else + echo "[INFO] Setting nvme_core.io_timeout to 240..." + case "$distro" in + ubuntu) + sed -i 's/GRUB_CMDLINE_LINUX_DEFAULT="/GRUB_CMDLINE_LINUX_DEFAULT="nvme_core.io_timeout=240 /g' /etc/default/grub + GRUB_DISABLE_OS_PROBER=true update-grub + ;; + debian) + sed -i 's/GRUB_CMDLINE_LINUX="/GRUB_CMDLINE_LINUX="nvme_core.io_timeout=240 /g' /etc/default/grub + GRUB_DISABLE_OS_PROBER=true update-grub + ;; + suse|sles|opensuse*) + if [ -f /etc/default/grub ]; then + sed -i 's/GRUB_CMDLINE_LINUX_DEFAULT="/GRUB_CMDLINE_LINUX_DEFAULT="nvme_core.io_timeout=240 /g' /etc/default/grub + GRUB_DISABLE_OS_PROBER=true grub2-mkconfig -o /boot/grub2/grub.cfg + else + echo "[ERROR] /etc/default/grub not found." + return 1 + fi + ;; + redhat|rhel|centos|rocky|almalinux) + if [ -f /etc/default/grub ]; then + sed -i 's/GRUB_CMDLINE_LINUX="/GRUB_CMDLINE_LINUX="nvme_core.io_timeout=240 /g' /etc/default/grub + GRUB_DISABLE_OS_PROBER=true grub2-mkconfig -o /boot/grub2/grub.cfg + # Update BLS entries if applicable (RHEL 8+, AlmaLinux 8+) + if grep -q "GRUB_ENABLE_BLSCFG=true" /etc/default/grub 2>/dev/null; then + if command -v grubby &>/dev/null; then + grubby --update-kernel=ALL --args="nvme_core.io_timeout=240" + echo "[INFO] Updated BLS entries via grubby." + fi + fi + else + echo "[ERROR] /etc/default/grub not found." 
# Helper: write a copy of /etc/fstab to "$1" in which /dev/sd* and
# /dev/disk/azure/scsi*/lun* device fields are replaced by UUID= specs.
# "$2" selects the log wording: "dryrun" reports what would change, any other
# value reports what was changed. Lines whose UUID cannot be resolved are
# copied through unchanged.
_rewrite_fstab_with_uuids() {
    local target="$1" mode="$2"
    local line device uuid

    # Truncate first so a rerun never appends to output left by an earlier run
    : > "$target"

    # IFS= keeps leading whitespace intact; the || test keeps a final line
    # that has no trailing newline (plain "read" would drop it)
    while IFS= read -r line || [ -n "$line" ]; do
        if [[ "$line" =~ ^[^#] ]]; then
            device=$(awk '{print $1}' <<< "$line")
            if [[ "$device" =~ ^/dev/sd[a-z][0-9]*$ ]] || [[ "$device" =~ ^/dev/disk/azure/scsi[0-9]*/lun[0-9]* ]]; then
                # Ask blkid for the UUID tag only. Splitting its full output on
                # quotes returns whichever attribute is printed first (e.g. LABEL).
                uuid=$(blkid -s UUID -o value "$device")
                if [ -n "$uuid" ]; then
                    # Literal, first-occurrence replacement: only the device
                    # field changes, never a later match in options or paths
                    line=${line/"$device"/UUID=$uuid}
                    if [ "$mode" = "dryrun" ]; then
                        echo "[DRYRUN] Would replace $device with UUID=$uuid"
                    else
                        echo "[INFO] Replaced $device with UUID=$uuid"
                    fi
                elif [ "$mode" = "dryrun" ]; then
                    echo "[DRYRUN] Could not find UUID for $device. Would skip."
                else
                    echo "[WARNING] Could not find UUID for $device. Skipping."
                fi
            fi
        fi
        echo "$line" >> "$target"
    done < /etc/fstab
}

# Function to check /etc/fstab for deprecated device names
check_fstab() {
    echo "[INFO] Checking /etc/fstab for deprecated device names..."
    # NOTE: /dev/mapper/* (LVM) and PARTUUID= paths survive NVMe conversion
    # because they use UUID-based addressing underneath. Only /dev/sd* and
    # /dev/disk/azure/scsi* paths break when disks move from SCSI to NVMe.
    if grep -Eq '/dev/sd[a-z][0-9]*|/dev/disk/azure/scsi[0-9]*/lun[0-9]*' /etc/fstab; then
        if $fix; then
            echo "[WARNING] /etc/fstab contains deprecated device names."
            if $dry_run; then
                echo "[DRYRUN] Staging fstab changes..."
                cp /etc/fstab "$staging_dir/original/fstab"

                # Build modified fstab in staging directory
                _rewrite_fstab_with_uuids "$staging_dir/modified/fstab" dryrun

                diff -u "$staging_dir/original/fstab" "$staging_dir/modified/fstab" > "$staging_dir/diffs/fstab.diff" 2>&1 || true
                echo "[DRYRUN] Fstab diff staged in $staging_dir/diffs/fstab.diff"
                cat "$staging_dir/diffs/fstab.diff"
            else
                echo "[INFO] Replacing deprecated device names in /etc/fstab with UUIDs..."

                # Create a backup of the fstab file
                cp /etc/fstab /etc/fstab.bak

                # Ensure fstab.new starts fresh (avoid stale leftovers from interrupted runs)
                rm -f /etc/fstab.new

                # Write the rewritten table next to the original, then swap it in
                _rewrite_fstab_with_uuids /etc/fstab.new apply

                # Replace the old fstab with the new fstab
                mv /etc/fstab.new /etc/fstab

                echo "[INFO] /etc/fstab updated with UUIDs. Original fstab backed up to /etc/fstab.bak"
            fi
        else
            echo "[ERROR] /etc/fstab contains device names causing issues switching to NVMe"
        fi
    else
        echo "[INFO] /etc/fstab does not contain deprecated device names."
    fi
}
+# -------------------------------------------------------------------------------------------- + +"""Parameter definitions for nvme-conversion commands.""" + +from azure.cli.core.commands.parameters import get_enum_type +from azext_nvme_conversion._validators import validate_vm_size, validate_sleep_seconds + + +def load_arguments(self, _): + """Define arguments for nvme-conversion convert and check commands.""" + + with self.argument_context('nvme-conversion') as c: + c.argument('resource_group_name', options_list=['--resource-group', '-g'], + help='Name of the resource group containing the VM.') + c.argument('vm_name', options_list=['--vm-name', '-n'], + help='Name of the Virtual Machine to convert or check.') + c.argument('new_controller_type', options_list=['--controller-type'], + arg_type=get_enum_type(['NVMe', 'SCSI']), + default=None, + help='Target disk controller type. If omitted, automatically toggles ' + 'to the opposite of the current type (SCSI becomes NVMe, NVMe becomes SCSI).') + c.argument('vm_size', options_list=['--vm-size'], + validator=validate_vm_size, + default=None, + help='Target VM size/SKU (e.g., Standard_E4bds_v5). If omitted, keeps the current ' + 'VM size when it supports the target controller type. Required when the current ' + 'size does not support the target controller.') + c.argument('ignore_sku_check', options_list=['--ignore-sku-check'], + action='store_true', default=False, + help='Skip SKU capability validation. Use when you know the target SKU is valid ' + 'but the API does not advertise DiskControllerTypes.') + c.argument('ignore_os_check', options_list=['--ignore-os-check'], + action='store_true', default=False, + help='Skip OS readiness checks (RunCommand). 
Faster, but will not detect ' + 'missing NVMe drivers or incorrect fstab/grub settings.') + c.argument('ignore_windows_version_check', options_list=['--ignore-windows-version-check', '--ignore-win-ver'], + action='store_true', default=False, + help='Skip the Windows Server version check. NVMe requires Windows Server 2019 or later.') + + with self.argument_context('nvme-conversion convert') as c: + c.argument('yes', options_list=['--yes', '-y'], + action='store_true', default=False, + help='Do not prompt for confirmation.') + c.argument('start_vm', options_list=['--start-vm'], + action='store_true', default=False, + help='Start the VM after conversion completes. If not specified, the VM remains deallocated.') + c.argument('fix_os', options_list=['--fix-os'], + action='store_true', default=False, + help='Automatically fix OS settings for NVMe readiness. ' + 'On Windows, sets the stornvme driver to boot start. ' + 'On Linux, rebuilds initramfs, sets grub io_timeout, fixes fstab device names, ' + 'and installs fallback udev rules if azure-vm-utils is not present.') + c.argument('dry_run', options_list=['--dry-run'], + action='store_true', default=False, + help='Linux only: run all OS checks and stage proposed changes in ' + '/tmp/nvme-conversion-dryrun/ on the VM without modifying system files ' + 'or performing the conversion. Useful for validating changes before applying.') + c.argument('sleep_seconds', options_list=['--sleep-seconds'], + type=int, default=15, + validator=validate_sleep_seconds, + help='Seconds to wait after conversion before starting the VM (default: 15). 
def validate_vm_size(namespace):
    """Validate that --vm-size has the shape of an Azure VM SKU name.

    An empty or missing value is accepted; the size is then resolved later
    from the VM itself. Otherwise the whole string must be ``Standard_``
    followed by letters, digits, underscores or hyphens. Hyphens are allowed
    because constrained-vCPU sizes such as ``Standard_M8-2ms`` contain one.

    Raises:
        InvalidArgumentValueError: if the value does not match that shape.
    """
    vm_size = namespace.vm_size
    if not vm_size:
        return
    # fullmatch (not match): a valid prefix followed by spaces or shell
    # metacharacters must be rejected, not waved through.
    if not re.fullmatch(r'Standard_[A-Za-z0-9][A-Za-z0-9_-]*', vm_size):
        raise InvalidArgumentValueError(
            f'VM size "{vm_size}" does not appear to be a valid Azure VM SKU. '
            'Expected format: Standard_ (e.g. Standard_E4bds_v5).')


def validate_sleep_seconds(namespace):
    """Validate that --sleep-seconds lies between 0 and 600 inclusive.

    A namespace without the attribute, or with it set to None, is accepted
    unchanged.

    Raises:
        InvalidArgumentValueError: if the value is negative or above 600.
    """
    seconds = getattr(namespace, 'sleep_seconds', None)
    if seconds is None:
        return
    if seconds < 0:
        raise InvalidArgumentValueError(
            '--sleep-seconds must be a non-negative integer.')
    if seconds > 600:
        raise InvalidArgumentValueError(
            '--sleep-seconds should not exceed 600 seconds (10 minutes).')
def prepare_windows(compute_client, resource_group_name, vm_name, fix_os):
    """Inspect or repair the Windows stornvme driver configuration on a VM.

    Both modes execute a ``RunPowerShellScript`` RunCommand on the VM and wait
    up to 600 seconds for it to finish.

    With ``fix_os`` set, ``sc.exe config stornvme start=boot`` is launched on
    the VM; the command output is not inspected.

    Without ``fix_os``, the script reports whether the stornvme ``Start``
    registry value equals 0 and whether a ``StartOverride`` key exists. Every
    output line containing ``ERROR`` is logged and collected.

    Raises:
        ValidationError: in check mode, when at least one ERROR line was
            reported.
    """
    from azure.mgmt.compute.models import RunCommandInput

    def _run_powershell(script_lines):
        # Submit the script and block until the RunCommand poller completes.
        payload = RunCommandInput(command_id='RunPowerShellScript', script=script_lines)
        poller = compute_client.virtual_machines.begin_run_command(
            resource_group_name, vm_name, payload)
        return poller.result(timeout=600)

    if fix_os:
        logger.warning('Fixing Windows stornvme driver settings...')
        fix_command = (
            'Start-Process -FilePath "C:\\Windows\\System32\\sc.exe" '
            '-ArgumentList "config stornvme start=boot"'
        )
        _run_powershell([fix_command])
        logger.warning('Windows stornvme driver set to boot start.')
        return

    logger.warning('Checking Windows stornvme driver settings...')
    probe_lines = [
        '$start = (Get-ItemProperty -Path '
        'HKLM:\\SYSTEM\\CurrentControlSet\\Services\\stornvme -Name Start).Start',
        'if ($start -eq 0) { Write-Host "Start:OK" } '
        'else { Write-Host "Start:ERROR" }',
        '$so = Get-ItemProperty -Path '
        'HKLM:\\SYSTEM\\CurrentControlSet\\Services\\stornvme\\StartOverride '
        '-ErrorAction SilentlyContinue',
        'if ($so) { Write-Host "StartOverride:ERROR" } '
        'else { Write-Host "StartOverride:OK" }',
    ]
    outcome = _run_powershell(probe_lines)

    # Gather every reported ERROR line, in output order, then log each one.
    problems = [
        text.strip()
        for chunk in (outcome.value or [])
        for text in (chunk.message or '').split('\n')
        if 'ERROR' in text
    ]
    for problem in problems:
        logger.warning(' OS check: %s', problem)

    if not problems:
        return

    bullet_list = '\n'.join(f' - {e}' for e in problems)
    raise ValidationError(
        'Windows OS is not ready for NVMe. Issues found:\n' +
        bullet_list +
        '\nUse --fix-os to automatically fix these issues.')
def load_command_table(self, _):
    """Attach the ``convert`` and ``check`` commands to the nvme-conversion group.

    Each command is registered with its own table transformer; ``convert`` is
    additionally registered with ``supports_no_wait=True``.
    """
    from azext_nvme_conversion._format import (
        check_result_table_format,
        convert_result_table_format,
    )

    with self.command_group('nvme-conversion', client_factory=None) as group:
        group.custom_command(
            'convert',
            'nvme_conversion_convert',
            supports_no_wait=True,
            table_transformer=convert_result_table_format,
        )
        group.custom_command(
            'check',
            'nvme_conversion_check',
            table_transformer=check_result_table_format,
        )
def nvme_conversion_convert(cmd, resource_group_name, vm_name, vm_size=None,
                            new_controller_type=None,
                            start_vm=False,
                            fix_os=False,
                            dry_run=False,
                            ignore_sku_check=False,
                            ignore_os_check=False,
                            ignore_windows_version_check=False,
                            sleep_seconds=15,
                            no_wait=False,
                            yes=False):
    """Convert a VM's disk controller between SCSI and NVMe.

    Runs eight reported steps: validate the VM, resolve the target size,
    check prerequisites, validate the SKU, check OS readiness, stop the VM,
    update the OS disk and VM, and optionally start the VM. Progress is
    written through ``_status``.

    Args:
        cmd: CLI command context; ``cmd.cli_ctx`` is used to build the compute client.
        resource_group_name: Resource group of the VM.
        vm_name: Name of the VM.
        vm_size: Requested target size, or None to let ``_resolve_vm_size`` decide.
        new_controller_type: Requested controller type, or None to let
            ``_resolve_controller_type`` decide.
        start_vm: Start the VM after the update.
        fix_os: Passed to the power-state check and to ``_prepare_os``.
        dry_run: Passed to ``_prepare_os``; the function returns right after
            the OS step without stopping or updating the VM.
        ignore_sku_check: Skip the SKU listing and ``_validate_sku``.
        ignore_os_check: Skip ``_prepare_os``.
        ignore_windows_version_check: Skip ``_check_windows_version`` on Windows VMs.
        sleep_seconds: Pause before starting the VM (only when ``no_wait`` is false).
        no_wait: Start the VM through ``_start_vm_no_wait`` and skip the pause.
        yes: Skip the interactive confirmation prompt.

    Returns:
        A dict whose ``status`` is ``'no-change'``, ``'dry-run-complete'`` or
        ``'succeeded'``. The ``'succeeded'`` result also carries the sizes,
        controller type and ``vmStarted``, plus ``revertCommand`` when the
        target is NVMe.

    Raises:
        SystemExit: when the confirmation prompt is declined.
    """
    from azext_nvme_conversion._client_factory import cf_compute

    compute_client = cf_compute(cmd.cli_ctx)

    _status(f'[1/8] Validating VM {vm_name}...')

    # Phase 1: Validation
    vm = _validate_vm(compute_client, resource_group_name, vm_name)
    os_type = _detect_os_type(vm)
    # Remembered before vm_size is resolved: used for the SKU comparison,
    # the result dict and the revert command.
    original_vm_size = vm.hardware_profile.vm_size

    # Auto-detect target controller type if not specified
    new_controller_type = _resolve_controller_type(vm, new_controller_type)
    if new_controller_type is None:
        # None signals that the VM already uses the target controller.
        return {'status': 'no-change', 'vm': vm_name, 'message': 'VM is already on the desired controller type.'}

    # Resolve SKU list once when SKU-based validation/resolution is enabled.
    vm_skus = _get_vm_skus(compute_client, vm.location) if not ignore_sku_check else None

    # Resolve VM size: use current if not specified and current supports the target controller
    _status('[2/8] Resolving VM size...')
    vm_size = _resolve_vm_size(compute_client, vm, vm_size, new_controller_type,
                               ignore_sku_check, vm_skus=vm_skus)

    _status(f' Target: {new_controller_type}, Size: {vm_size}')

    # Dynamic confirmation prompt with VM-specific context
    # NOTE: the prompt is shown before the dry_run early exit further down,
    # so a dry run without --yes is also asked to confirm.
    if not yes:
        from knack.prompting import prompt_y_n
        if not prompt_y_n(
                f'This will deallocate VM \'{vm_name}\', change its disk controller to {new_controller_type}, '
                f'and resize to {vm_size}' +
                ('. The VM will be restarted after conversion.' if start_vm else
                 '. The VM will remain deallocated after conversion.') +
                ' Continue?', default='n'):
            raise SystemExit('Aborted by user.')

    _status('[3/8] Checking prerequisites (ADE, generation, power state)...')
    _check_ade_extension(compute_client, resource_group_name, vm_name, os_type)
    _check_vm_generation(compute_client, vm)

    # False means the VM is not running; the OS step and the shutdown step
    # below are then skipped.
    vm_is_running = _check_vm_power_state(compute_client, resource_group_name, vm_name, fix_os)

    if os_type == 'Windows' and not ignore_windows_version_check:
        _check_windows_version(vm)

    # Phase 2: SKU validation
    if not ignore_sku_check:
        _status('[4/8] Validating SKU capabilities (this may take a moment)...')
        _validate_sku(compute_client, vm, vm_size, new_controller_type, os_type,
                      original_vm_size, vm_skus=vm_skus)
    else:
        _status('[4/8] SKU validation skipped.')

    # Phase 3: OS preparation (requires running VM)
    if not ignore_os_check and vm_is_running:
        _status('[5/8] Checking OS readiness via RunCommand (60-120s)...')
        _prepare_os(compute_client, resource_group_name, vm_name, os_type,
                    new_controller_type, fix_os, dry_run)
    elif not vm_is_running:
        _status('[5/8] OS readiness check skipped (VM is not running).')
    else:
        _status('[5/8] OS readiness check skipped.')

    # Dry-run stops here
    if dry_run:
        _status('Dry-run complete. No VM changes were made.')
        return {'status': 'dry-run-complete', 'vm': vm_name}

    # Phase 4: Conversion
    if vm_is_running:
        _status(f'[6/8] Shutting down VM {vm_name} (1-3 minutes)...')
        _stop_vm(compute_client, resource_group_name, vm_name)
    else:
        _status(f'[6/8] VM {vm_name} is already stopped. Skipping shutdown.')

    _status('[7/8] Updating OS disk capabilities and VM size...')
    _update_disk_capabilities(compute_client, vm, new_controller_type)
    _update_vm(compute_client, resource_group_name, vm, vm_size, new_controller_type)

    # Phase 5: Start VM
    if start_vm:
        if no_wait:
            # No settle pause on this path.
            _status(f'[8/8] Starting VM {vm_name} (--no-wait)...')
            _start_vm_no_wait(compute_client, resource_group_name, vm_name)
        else:
            _status(f'[8/8] Waiting {sleep_seconds}s then starting VM {vm_name}...')
            import time
            time.sleep(sleep_seconds)
            _start_vm(compute_client, resource_group_name, vm_name)
            _status(f' VM {vm_name} started.')
    else:
        _status('[8/8] VM not started (use --start-vm to start automatically).')

    result = {
        'status': 'succeeded',
        'vm': vm_name,
        'resourceGroup': resource_group_name,
        'previousSize': original_vm_size,
        'newSize': vm_size,
        'controllerType': new_controller_type,
        # Reflects the --start-vm flag, including the --no-wait case.
        'vmStarted': start_vm,
    }

    # Only an NVMe conversion gets a ready-made command to go back to SCSI
    # on the original size.
    if new_controller_type == 'NVMe':
        revert_cmd = (
            f'az nvme-conversion convert --resource-group {resource_group_name} '
            f'--vm-name {vm_name} --controller-type SCSI '
            f'--vm-size {original_vm_size} --start-vm --yes'
        )
        logger.warning('To revert: %s', revert_cmd)
        result['revertCommand'] = revert_cmd

    return result
cf_compute(cmd.cli_ctx) + + _status('[1/7] Checking VM exists...') + + # VM exists + try: + vm = _validate_vm(compute_client, resource_group_name, vm_name) + except (ResourceNotFoundError, ValidationError) as e: + return { + 'vm': vm_name, + 'resourceGroup': resource_group_name, + 'checks': {'vmExists': {'status': 'failed', 'message': str(e)}}, + 'overallStatus': 'failed', + } + + os_type = _detect_os_type(vm) + new_controller_type = _resolve_controller_type(vm, new_controller_type) + + # Resolve SKU list once for both _resolve_vm_size and _validate_sku + vm_skus = None + if new_controller_type is not None and (not ignore_sku_check or not vm_size): + try: + vm_skus = _get_vm_skus(compute_client, vm.location) + except Exception: # pylint: disable=broad-exception-caught + vm_skus = None + + # Resolve VM size for the check + resolved_vm_size = vm_size + if new_controller_type is not None: + try: + resolved_vm_size = _resolve_vm_size( + compute_client, vm, vm_size, new_controller_type, ignore_sku_check, + vm_skus=vm_skus) + except (InvalidArgumentValueError, ValidationError): + resolved_vm_size = vm_size or vm.hardware_profile.vm_size + + results = { + 'vm': vm_name, + 'resourceGroup': resource_group_name, + 'currentControllerType': _get_current_controller(vm), + 'targetControllerType': new_controller_type or _get_current_controller(vm), + 'targetVmSize': resolved_vm_size, + 'osType': os_type, + 'currentSize': vm.hardware_profile.vm_size, + 'checks': {'vmExists': {'status': 'passed'}}, + } + + if new_controller_type is None: + results['checks']['controllerCheck'] = { + 'status': 'info', + 'message': f'VM is already running {_get_current_controller(vm)}. No conversion needed.' + } + results['overallStatus'] = 'passed' + _status(f' VM is already running {_get_current_controller(vm)}. 
No conversion needed.') + return results + + results['checks']['controllerCheck'] = {'status': 'passed'} + + # ADE check + _status('[2/7] Checking ADE extension...') + try: + _check_ade_extension(compute_client, resource_group_name, vm_name, os_type) + results['checks']['adeCheck'] = {'status': 'passed'} + except ValidationError as e: + results['checks']['adeCheck'] = {'status': 'failed', 'message': str(e)} + + # Generation check + _status('[3/7] Checking VM generation...') + try: + _check_vm_generation(compute_client, vm) + results['checks']['generationCheck'] = {'status': 'passed'} + except ValidationError as e: + results['checks']['generationCheck'] = {'status': 'failed', 'message': str(e)} + + # Power state + vm_is_running = True + if not ignore_os_check: + _status('[4/7] Checking power state...') + try: + vm_is_running = _check_vm_power_state(compute_client, resource_group_name, vm_name, False) + results['checks']['powerState'] = { + 'status': 'passed' if vm_is_running else 'info', + 'message': '' if vm_is_running else 'VM is not running. OS checks skipped.' 
+ } + except ValidationError as e: + results['checks']['powerState'] = {'status': 'failed', 'message': str(e)} + + # Windows version + if os_type == 'Windows' and not ignore_windows_version_check: + _status('[5/7] Checking Windows version...') + try: + _check_windows_version(vm) + results['checks']['windowsVersion'] = {'status': 'passed'} + except ValidationError as e: + results['checks']['windowsVersion'] = {'status': 'failed', 'message': str(e)} + + # SKU validation + if not ignore_sku_check and resolved_vm_size: + _status('[6/7] Validating SKU capabilities (this may take a moment)...') + try: + _validate_sku(compute_client, vm, resolved_vm_size, new_controller_type, + os_type, vm.hardware_profile.vm_size, vm_skus=vm_skus) + results['checks']['skuValidation'] = {'status': 'passed'} + except (ValidationError, InvalidArgumentValueError) as e: + results['checks']['skuValidation'] = {'status': 'failed', 'message': str(e)} + + # OS readiness (check only, no fix — requires running VM) + if not ignore_os_check and vm_is_running: + _status('[6/7] Checking OS readiness via RunCommand (60-120s)...') + try: + _check_os_readiness(compute_client, resource_group_name, vm_name, os_type, new_controller_type) + results['checks']['osReadiness'] = {'status': 'passed'} + except ValidationError as e: + results['checks']['osReadiness'] = {'status': 'failed', 'message': str(e)} + + failed = [k for k, v in results['checks'].items() if v['status'] == 'failed'] + results['overallStatus'] = 'failed' if failed else 'passed' + + _status(f'[7/7] Done. 
Overall: {results["overallStatus"]}') + + return results + + +# --------------------------------------------------------------------------- +# Internal helpers +# --------------------------------------------------------------------------- + +def _validate_vm(compute_client, resource_group_name, vm_name): + """Get VM and raise if not found or unsupported.""" + try: + vm = compute_client.virtual_machines.get(resource_group_name, vm_name) + except Exception as e: + raise ResourceNotFoundError( + f'VM {vm_name} not found in resource group {resource_group_name}: {e}') from e + if not vm.storage_profile.os_disk.managed_disk: + raise ValidationError( + f'VM {vm_name} uses an unmanaged (page-blob) OS disk. ' + 'Convert to a managed disk before running nvme-conversion.') + return vm + + +def _get_vm_skus(compute_client, location): + """Resolve and cache the list of virtualMachines SKUs for a location.""" + skus = list(compute_client.resource_skus.list(filter=f"location eq '{location}'")) + return [s for s in skus if s.resource_type == 'virtualMachines'] + + +def _detect_os_type(vm): + """Detect if the VM is running Windows or Linux.""" + if vm.storage_profile.os_disk.os_type and str(vm.storage_profile.os_disk.os_type).lower() == 'windows': + return 'Windows' + return 'Linux' + + +def _check_ade_extension(compute_client, resource_group_name, vm_name, os_type): + """Block conversion if Azure Disk Encryption for Linux is installed.""" + if os_type != 'Linux': + return + try: + ext = compute_client.virtual_machine_extensions.get( + resource_group_name, vm_name, 'AzureDiskEncryptionForLinux') + if ext and ext.provisioning_state == 'Succeeded': + raise ValidationError( + f'Azure Disk Encryption for Linux is installed on VM {vm_name}. ' + 'ADE does not support NVMe disks. Remove the extension first.') + if ext: + raise ValidationError( + f'Azure Disk Encryption for Linux extension is installed but provisioning state is: ' + f'{ext.provisioning_state}. 
# NOTE(review): this chunk opens with the tail of a definition that starts
# before the visible range. It cannot stand alone, so it is preserved verbatim
# here (commented) and is otherwise untouched:
#
#     ... Remove the extension if the VM has not been encrypted.')
#     except ValidationError:
#         raise
#     except Exception:  # pylint: disable=broad-exception-caught
#         # Extension not found — this is the expected case
#         pass


def _resource_group_from_id(resource_id):
    """Return the resource-group segment of an ARM resource ID.

    The segment is located by searching for the ``resourceGroups`` marker
    (case-insensitively) rather than by assuming a fixed position in the path.

    Raises:
        ValueError: if the ID contains no resource-group segment.
    """
    segments = (resource_id or '').split('/')
    for index, segment in enumerate(segments[:-1]):
        if segment.lower() == 'resourcegroups' and segments[index + 1]:
            return segments[index + 1]
    raise ValueError(f'No resource group found in resource ID: {resource_id!r}')


def _os_disk_ref(vm):
    """Return ``(resource_group, disk_name)`` for the VM's managed OS disk.

    Raises:
        ValidationError: if the OS disk has no managed-disk ID or the ID has no
            resource-group segment. The helpers below go through
            ``compute_client.disks``, so they need both values.
    """
    os_disk = vm.storage_profile.os_disk
    managed_disk = os_disk.managed_disk
    disk_id = managed_disk.id if managed_disk is not None else None
    if not disk_id:
        raise ValidationError(
            f'OS disk {os_disk.name} has no managed disk ID. '
            'Only managed OS disks are supported.')
    try:
        return _resource_group_from_id(disk_id), os_disk.name
    except ValueError as ex:
        raise ValidationError(str(ex)) from ex


def _sku_capability(sku, capability_name):
    """Return the value of the first SKU capability named ``capability_name``, or None."""
    for cap in (sku.capabilities or []):
        if cap.name == capability_name:
            return cap.value
    return None


def _supported_controller_types(sku):
    """Return the SKU's raw ``DiskControllerTypes`` value, or None when absent.

    Callers in this module treat an absent capability as "SCSI only".
    """
    return _sku_capability(sku, 'DiskControllerTypes')


def _sku_has_resource_disk(sku):
    """Return False only when the SKU reports ``MaxResourceVolumeMB == '0'``."""
    return not any(
        cap.name == 'MaxResourceVolumeMB' and cap.value == '0'
        for cap in (sku.capabilities or []))


def _check_vm_power_state(compute_client, resource_group_name, vm_name, fix_os):
    """Verify the VM is running (required for OS checks).

    Returns:
        True if the instance view reports ``PowerState/running``; False if it
        does not and ``fix_os`` is falsy (a status line is printed instead).

    Raises:
        ValidationError: if the VM is not running and ``fix_os`` is truthy.
    """
    instance_view = compute_client.virtual_machines.instance_view(resource_group_name, vm_name)
    # First status whose code is in the PowerState/ family; tolerate a missing
    # status list or a status without a code.
    power_state = next(
        (status.code for status in (instance_view.statuses or [])
         if status.code and status.code.startswith('PowerState/')),
        None)

    if power_state == 'PowerState/running':
        return True

    if fix_os:
        raise ValidationError(
            f'VM {vm_name} is not running (state: {power_state}). '
            'The VM must be running to fix OS settings.')
    _status(f' VM is not running (state: {power_state}). OS checks will be skipped.')
    return False


def _check_vm_generation(compute_client, vm):
    """Verify the VM is running a Gen2 image.

    Reads ``hyper_v_generation`` from the OS disk and rejects only the explicit
    value ``'V1'``.

    Raises:
        ValidationError: if the OS disk is Generation 1, or the OS disk cannot
            be located (see ``_os_disk_ref``).
    """
    disk_rg, disk_name = _os_disk_ref(vm)
    os_disk = compute_client.disks.get(disk_rg, disk_name)
    if os_disk.hyper_v_generation == 'V1':
        raise ValidationError(
            'VM is running a Generation 1 image. '
            'NVMe controllers are only supported on Generation 2 images.')
    logger.info('VM is running a Generation 2 image.')


def _get_current_controller(vm):
    """Return the current controller type label (SCSI or NVMe).

    An unset controller type is reported as ``'SCSI'``. Enum-like values are
    unwrapped through ``.value`` so the label is the wire value (e.g. ``'NVMe'``)
    rather than the enum member's repr-style string.
    """
    current = vm.storage_profile.disk_controller_type
    if not current:
        return 'SCSI'
    return str(getattr(current, 'value', current))


def _resolve_controller_type(vm, requested_type):
    """Resolve the target controller type.

    If ``requested_type`` is None, auto-toggle to the opposite of the current type.

    Returns:
        The target type, or None when the VM is already on it (no-op signal).
    """
    current = _get_current_controller(vm)

    if requested_type is not None:
        target = requested_type
    else:
        # Auto-toggle
        target = 'NVMe' if current == 'SCSI' else 'SCSI'

    if current == target:
        logger.info('VM is already running %s. No conversion needed.', current)
        return None

    logger.info('Current controller: %s -> Target: %s', current, target)
    return target


def _resolve_vm_size(compute_client, vm, requested_size, new_controller_type,
                     ignore_sku_check, vm_skus=None):
    """Resolve the target VM size.

    * ``requested_size`` given: returned as-is.
    * ``ignore_sku_check`` set: the current size is returned without consulting SKUs.
    * Otherwise the current size is returned only if its SKU supports
      ``new_controller_type``.

    ``vm_skus`` may be passed to reuse an already-fetched SKU list; when None it
    is fetched with ``_get_vm_skus`` for the VM's location.

    Raises:
        InvalidArgumentValueError: if the current size is not in the SKU list or
            does not support the target controller type.
    """
    current_size = vm.hardware_profile.vm_size

    if requested_size:
        return requested_size

    if ignore_sku_check:
        logger.warning('No --vm-size specified and SKU check is skipped. Using current size: %s', current_size)
        return current_size

    # Check if current size supports the target controller
    if vm_skus is None:
        vm_skus = _get_vm_skus(compute_client, vm.location)
    current_sku = next((s for s in vm_skus if s.name == current_size), None)

    if not current_sku:
        raise InvalidArgumentValueError(
            f'Current VM size {current_size} not found in SKU list for location {vm.location}. '
            'Please specify --vm-size explicitly.')

    supported_controllers = _supported_controller_types(current_sku)

    if supported_controllers is None:
        # DiskControllerTypes absent = SCSI-only SKU
        if new_controller_type == 'NVMe':
            raise InvalidArgumentValueError(
                f'Current VM size {current_size} does not support NVMe '
                '(no DiskControllerTypes capability in SKU API — this means SCSI only). '
                'You must specify --vm-size with an NVMe-capable SKU (e.g. Standard_E4bds_v5, Standard_D2s_v6).')
        _status(f' Current VM size {current_size} is SCSI-only. Keeping same size.')
        return current_size

    if new_controller_type == 'NVMe' and 'NVMe' not in supported_controllers:
        raise InvalidArgumentValueError(
            f'Current VM size {current_size} does not support NVMe. '
            'You must specify --vm-size with an NVMe-capable SKU (e.g. Standard_E4bds_v5).')

    if new_controller_type == 'SCSI' and 'SCSI' not in supported_controllers:
        raise InvalidArgumentValueError(
            f'Current VM size {current_size} does not support SCSI. '
            'You must specify --vm-size with a SCSI-capable SKU.')

    logger.info('Current VM size %s supports %s. Keeping same size.', current_size, new_controller_type)
    return current_size


def _check_windows_version(vm):
    """Check that Windows version is 2019 or higher (delegates to ``_windows_checks``)."""
    from azext_nvme_conversion._windows_checks import check_windows_version
    check_windows_version(vm)


def _validate_sku(compute_client, vm, vm_size, new_controller_type, os_type,
                  original_vm_size, vm_skus=None):
    """Validate target SKU exists, is available in the VM's zone(s), and supports the controller.

    For Windows, additionally requires that the original and target sizes agree
    on whether they have a resource disk (skipped when the original size is not
    in the SKU list).

    Raises:
        InvalidArgumentValueError: unknown SKU, zone mismatch, or unsupported controller.
        ValidationError: resource-disk mismatch on Windows.
    """
    logger.info('Validating SKU %s...', vm_size)

    if vm_skus is None:
        vm_skus = _get_vm_skus(compute_client, vm.location)
    target_sku = next((s for s in vm_skus if s.name == vm_size), None)

    if not target_sku:
        raise InvalidArgumentValueError(f'VM SKU {vm_size} does not exist. Check your input.')

    # Zone availability — must be available in every zone the VM is pinned to
    if vm.zones:
        sku_zones = set()
        for loc_info in (target_sku.location_info or []):
            sku_zones.update(loc_info.zones or [])
        missing = [z for z in vm.zones if z not in sku_zones]
        if missing:
            raise InvalidArgumentValueError(
                f'VM SKU {vm_size} is not available in zone(s) {",".join(missing)} '
                f'(VM zones: {",".join(vm.zones)}).')
        logger.info('SKU %s is available in zone(s) %s.', vm_size, ','.join(vm.zones))

    # Resource disk compatibility (Windows only)
    if os_type == 'Windows':
        original_sku = next((s for s in vm_skus if s.name == original_vm_size), None)
        if original_sku and _sku_has_resource_disk(original_sku) != _sku_has_resource_disk(target_sku):
            raise ValidationError(
                f'Mismatch in resource disk support between original VM size '
                f'({original_vm_size}) and new VM size ({vm_size}).')

    # Controller support
    supported_controllers = _supported_controller_types(target_sku)

    if supported_controllers is None:
        # DiskControllerTypes absent in SKU API = SCSI-only
        if new_controller_type == 'NVMe':
            raise InvalidArgumentValueError(
                f'VM SKU {vm_size} does not support NVMe '
                '(no DiskControllerTypes capability — SCSI only). '
                'Use an NVMe-capable SKU (e.g. Standard_E4bds_v5, Standard_D2s_v6).')
        _status(f' SKU {vm_size} is SCSI-only (no DiskControllerTypes). OK for SCSI target.')
    elif new_controller_type == 'NVMe' and 'NVMe' not in supported_controllers:
        raise InvalidArgumentValueError(f'VM SKU {vm_size} does not support NVMe.')
    elif new_controller_type == 'SCSI' and 'SCSI' not in supported_controllers:
        raise InvalidArgumentValueError(f'VM SKU {vm_size} does not support SCSI.')
    else:
        logger.info('SKU %s supports %s.', vm_size, new_controller_type)


def _prepare_os(compute_client, resource_group_name, vm_name, os_type,
                new_controller_type, fix_os, dry_run):
    """Run OS readiness checks and optionally fix issues.

    When fix_os=False and dry_run=False this is a read-only check
    (used by both `convert` and `check` commands). Nothing is run when the
    target controller is not NVMe.
    """
    if new_controller_type != 'NVMe':
        if fix_os or dry_run:
            logger.info('No OS preparation required for SCSI.')
        return

    if os_type == 'Windows':
        from azext_nvme_conversion._windows_checks import prepare_windows
        prepare_windows(compute_client, resource_group_name, vm_name, fix_os)
    else:
        from azext_nvme_conversion._linux_checks import prepare_linux
        prepare_linux(compute_client, resource_group_name, vm_name, fix_os, dry_run)


def _check_os_readiness(compute_client, resource_group_name, vm_name, os_type, new_controller_type):
    """Check OS readiness without fixing (thin wrapper around _prepare_os)."""
    _prepare_os(compute_client, resource_group_name, vm_name, os_type,
                new_controller_type, fix_os=False, dry_run=False)


def _stop_vm(compute_client, resource_group_name, vm_name):
    """Deallocate the VM and confirm the instance view reports it as deallocated.

    Raises:
        ValidationError: if no ``PowerState/deallocated`` status is present afterwards.
    """
    compute_client.virtual_machines.begin_deallocate(resource_group_name, vm_name).result()
    logger.info('VM %s deallocated.', vm_name)

    # Verify deallocated
    instance_view = compute_client.virtual_machines.instance_view(resource_group_name, vm_name)
    if any(status.code == 'PowerState/deallocated' for status in (instance_view.statuses or [])):
        return
    raise ValidationError(f'VM {vm_name} is not deallocated after stop. Check the VM status.')


def _update_disk_capabilities(compute_client, vm, new_controller_type):
    """Update the OS disk supportedCapabilities to allow the new controller type.

    An NVMe target sets ``'SCSI, NVMe'``; any other target sets ``'SCSI'``.
    """
    from azure.mgmt.compute.models import DiskUpdate, SupportedCapabilities

    disk_rg, disk_name = _os_disk_ref(vm)
    controller_types = 'SCSI, NVMe' if new_controller_type == 'NVMe' else 'SCSI'

    disk_update = DiskUpdate(
        supported_capabilities=SupportedCapabilities(
            disk_controller_types=controller_types
        )
    )

    compute_client.disks.begin_update(disk_rg, disk_name, disk_update).result()
    logger.info('OS disk %s updated with controller types: %s', disk_name, controller_types)


def _update_vm(compute_client, resource_group_name, vm, vm_size, new_controller_type):
    """Update VM size and disk controller type.

    Mutates ``vm`` in place, submits it with ``begin_create_or_update`` and
    returns the poller's result.
    """
    vm.hardware_profile.vm_size = vm_size
    vm.storage_profile.disk_controller_type = new_controller_type

    poller = compute_client.virtual_machines.begin_create_or_update(resource_group_name, vm.name, vm)
    result = poller.result()
    logger.info('VM %s updated to size %s with controller %s.', vm.name, vm_size, new_controller_type)
    return result


def _start_vm(compute_client, resource_group_name, vm_name):
    """Start the VM and wait for the operation to finish."""
    compute_client.virtual_machines.begin_start(resource_group_name, vm_name).result()
    logger.info('VM %s started.', vm_name)


def _start_vm_no_wait(compute_client, resource_group_name, vm_name):
    """Start the VM without waiting for completion."""
    compute_client.virtual_machines.begin_start(resource_group_name, vm_name)
    logger.info('VM %s start initiated (not waiting for completion).', vm_name)


# [patch residue, kept verbatim] diff --git a/src/nvme-conversion/azext_nvme_conversion/tests/__init__.py b/src/nvme-conversion/azext_nvme_conversion/tests/__init__.py new file mode 100755 index 00000000000..34913fb394d ---
/dev/null +++ b/src/nvme-conversion/azext_nvme_conversion/tests/__init__.py @@ -0,0 +1,4 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for license information. +# -------------------------------------------------------------------------------------------- diff --git a/src/nvme-conversion/azext_nvme_conversion/tests/latest/__init__.py b/src/nvme-conversion/azext_nvme_conversion/tests/latest/__init__.py new file mode 100755 index 00000000000..34913fb394d --- /dev/null +++ b/src/nvme-conversion/azext_nvme_conversion/tests/latest/__init__.py @@ -0,0 +1,4 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for license information. +# -------------------------------------------------------------------------------------------- diff --git a/src/nvme-conversion/azext_nvme_conversion/tests/latest/test_nvme_conversion.py b/src/nvme-conversion/azext_nvme_conversion/tests/latest/test_nvme_conversion.py new file mode 100755 index 00000000000..aa5e372546a --- /dev/null +++ b/src/nvme-conversion/azext_nvme_conversion/tests/latest/test_nvme_conversion.py @@ -0,0 +1,689 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for license information. 
+# -------------------------------------------------------------------------------------------- + +import unittest +from unittest.mock import MagicMock, patch +from azure.cli.testsdk import ScenarioTest + + +class NvmeConversionCheckTest(ScenarioTest): + """Scenario tests for nvme-conversion check command.""" + + @unittest.skip('Requires live Azure resources') + def test_nvme_conversion_check(self): + self.cmd('nvme-conversion check ' + '--resource-group {rg} ' + '--vm-name {vm} ' + '--vm-size Standard_E4bds_v5', + checks=[ + self.check('overallStatus', 'passed'), + self.check('vm', '{vm}'), + ]) + + +class NvmeConversionConvertTest(ScenarioTest): + """Scenario tests for nvme-conversion convert command.""" + + @unittest.skip('Requires live Azure resources') + def test_nvme_conversion_convert_scsi_to_nvme(self): + self.cmd('nvme-conversion convert ' + '--resource-group {rg} ' + '--vm-name {vm} ' + '--vm-size Standard_E4bds_v5 ' + '--start-vm ' + '--yes', + checks=[ + self.check('status', 'succeeded'), + self.check('controllerType', 'NVMe'), + ]) + + +class NvmeConversionUnitTests(unittest.TestCase): + """Unit tests for internal helper functions.""" + + def test_detect_os_type_windows(self): + from azext_nvme_conversion.custom import _detect_os_type + vm = MagicMock() + vm.storage_profile.os_disk.os_type = 'Windows' + self.assertEqual(_detect_os_type(vm), 'Windows') + + def test_detect_os_type_linux(self): + from azext_nvme_conversion.custom import _detect_os_type + vm = MagicMock() + vm.storage_profile.os_disk.os_type = 'Linux' + self.assertEqual(_detect_os_type(vm), 'Linux') + + def test_detect_os_type_none_defaults_linux(self): + from azext_nvme_conversion.custom import _detect_os_type + vm = MagicMock() + vm.storage_profile.os_disk.os_type = None + self.assertEqual(_detect_os_type(vm), 'Linux') + + def test_resolve_controller_scsi_to_nvme_auto(self): + from azext_nvme_conversion.custom import _resolve_controller_type + vm = MagicMock() + 
vm.storage_profile.disk_controller_type = 'SCSI' + self.assertEqual(_resolve_controller_type(vm, None), 'NVMe') + + def test_resolve_controller_nvme_to_scsi_auto(self): + from azext_nvme_conversion.custom import _resolve_controller_type + vm = MagicMock() + vm.storage_profile.disk_controller_type = 'NVMe' + self.assertEqual(_resolve_controller_type(vm, None), 'SCSI') + + def test_resolve_controller_already_on_target_returns_none(self): + from azext_nvme_conversion.custom import _resolve_controller_type + vm = MagicMock() + vm.storage_profile.disk_controller_type = 'NVMe' + self.assertIsNone(_resolve_controller_type(vm, 'NVMe')) + + def test_resolve_controller_null_treated_as_scsi(self): + from azext_nvme_conversion.custom import _resolve_controller_type + vm = MagicMock() + vm.storage_profile.disk_controller_type = None + # None is treated as SCSI, auto-toggle should pick NVMe + self.assertEqual(_resolve_controller_type(vm, None), 'NVMe') + + def test_resolve_controller_null_explicit_scsi_returns_none(self): + from azext_nvme_conversion.custom import _resolve_controller_type + vm = MagicMock() + vm.storage_profile.disk_controller_type = None + # None is SCSI, requesting SCSI should be no-op + self.assertIsNone(_resolve_controller_type(vm, 'SCSI')) + + def test_get_current_controller_scsi(self): + from azext_nvme_conversion.custom import _get_current_controller + vm = MagicMock() + vm.storage_profile.disk_controller_type = 'SCSI' + self.assertEqual(_get_current_controller(vm), 'SCSI') + + def test_get_current_controller_none_is_scsi(self): + from azext_nvme_conversion.custom import _get_current_controller + vm = MagicMock() + vm.storage_profile.disk_controller_type = None + self.assertEqual(_get_current_controller(vm), 'SCSI') + + def test_check_windows_version_ok(self): + from azext_nvme_conversion.custom import _check_windows_version + vm = MagicMock() + vm.storage_profile.image_reference.publisher = 'MicrosoftWindowsServer' + 
vm.storage_profile.image_reference.sku = '2022-datacenter-g2' + # Should not raise + _check_windows_version(vm) + + def test_check_windows_version_too_old(self): + from azext_nvme_conversion.custom import _check_windows_version + from azure.cli.core.azclierror import ValidationError + vm = MagicMock() + vm.storage_profile.image_reference.publisher = 'MicrosoftWindowsServer' + vm.storage_profile.image_reference.sku = '2016-Datacenter' + with self.assertRaises(ValidationError): + _check_windows_version(vm) + + def test_check_vm_generation_v1_blocked(self): + from azext_nvme_conversion.custom import _check_vm_generation + from azure.cli.core.azclierror import ValidationError + compute_client = MagicMock() + vm = MagicMock() + vm.storage_profile.os_disk.managed_disk.id = '/subscriptions/sub/resourceGroups/rg/providers/Microsoft.Compute/disks/osdisk' + vm.storage_profile.os_disk.name = 'osdisk' + disk = MagicMock() + disk.hyper_v_generation = 'V1' + compute_client.disks.get.return_value = disk + with self.assertRaises(ValidationError): + _check_vm_generation(compute_client, vm) + + def test_check_vm_generation_v2_passes(self): + from azext_nvme_conversion.custom import _check_vm_generation + compute_client = MagicMock() + vm = MagicMock() + vm.storage_profile.os_disk.managed_disk.id = '/subscriptions/sub/resourceGroups/rg/providers/Microsoft.Compute/disks/osdisk' + vm.storage_profile.os_disk.name = 'osdisk' + disk = MagicMock() + disk.hyper_v_generation = 'V2' + compute_client.disks.get.return_value = disk + # Should not raise + _check_vm_generation(compute_client, vm) + + def test_linux_script_not_empty(self): + from azext_nvme_conversion._linux_script import get_linux_check_script + script = get_linux_check_script() + self.assertIn('#!/bin/bash', script) + self.assertIn('check_nvme_driver', script) + self.assertIn('check_nvme_timeout', script) + self.assertIn('check_fstab', script) + + def test_linux_script_has_azure_vm_utils_check(self): + from 
azext_nvme_conversion._linux_script import get_linux_check_script + script = get_linux_check_script() + self.assertIn('check_azure_vm_utils', script) + self.assertIn('azure-nvme-id', script) + self.assertIn('80-azure-disk.rules', script) + + +class WindowsChecksUnitTests(unittest.TestCase): + """Unit tests for Windows OS checks.""" + + def test_prepare_windows_check_passes(self): + from azext_nvme_conversion._windows_checks import prepare_windows + compute_client = MagicMock() + output_value = MagicMock() + output_value.message = 'Start:OK\nStartOverride:OK\n' + result = MagicMock() + result.value = [output_value] + poller = MagicMock() + poller.result.return_value = result + compute_client.virtual_machines.begin_run_command.return_value = poller + # Should not raise + prepare_windows(compute_client, 'rg', 'vm', fix_os=False) + + def test_prepare_windows_check_fails_start_error(self): + from azext_nvme_conversion._windows_checks import prepare_windows + from azure.cli.core.azclierror import ValidationError + compute_client = MagicMock() + output_value = MagicMock() + output_value.message = 'Start:ERROR\nStartOverride:OK\n' + result = MagicMock() + result.value = [output_value] + poller = MagicMock() + poller.result.return_value = result + compute_client.virtual_machines.begin_run_command.return_value = poller + with self.assertRaises(ValidationError): + prepare_windows(compute_client, 'rg', 'vm', fix_os=False) + + def test_prepare_windows_check_fails_startoverride_error(self): + from azext_nvme_conversion._windows_checks import prepare_windows + from azure.cli.core.azclierror import ValidationError + compute_client = MagicMock() + output_value = MagicMock() + output_value.message = 'Start:OK\nStartOverride:ERROR\n' + result = MagicMock() + result.value = [output_value] + poller = MagicMock() + poller.result.return_value = result + compute_client.virtual_machines.begin_run_command.return_value = poller + with self.assertRaises(ValidationError): + 
prepare_windows(compute_client, 'rg', 'vm', fix_os=False) + + def test_prepare_windows_fix_calls_run_command(self): + from azext_nvme_conversion._windows_checks import prepare_windows + compute_client = MagicMock() + poller = MagicMock() + compute_client.virtual_machines.begin_run_command.return_value = poller + prepare_windows(compute_client, 'rg', 'vm', fix_os=True) + compute_client.virtual_machines.begin_run_command.assert_called_once() + call_args = compute_client.virtual_machines.begin_run_command.call_args + self.assertEqual(call_args[0][0], 'rg') + self.assertEqual(call_args[0][1], 'vm') + + def test_check_windows_version_2012_r2_blocked(self): + from azext_nvme_conversion._windows_checks import check_windows_version + from azure.cli.core.azclierror import ValidationError + vm = MagicMock() + vm.storage_profile.image_reference.publisher = 'MicrosoftWindowsServer' + vm.storage_profile.image_reference.sku = '2012-R2-Datacenter' + with self.assertRaises(ValidationError): + check_windows_version(vm) + + def test_check_windows_version_no_year_skips(self): + from azext_nvme_conversion._windows_checks import check_windows_version + vm = MagicMock() + vm.storage_profile.image_reference.publisher = 'MicrosoftWindowsServer' + vm.storage_profile.image_reference.sku = 'datacenter-core-smalldisk' + # No 4-digit year in SKU — should not raise + check_windows_version(vm) + + def test_check_windows_version_non_microsoft_publisher_skips(self): + from azext_nvme_conversion._windows_checks import check_windows_version + vm = MagicMock() + vm.storage_profile.image_reference.publisher = 'SomeOtherPublisher' + # Should not raise — non-Microsoft publishers are not checked + check_windows_version(vm) + + +class LinuxChecksUnitTests(unittest.TestCase): + """Unit tests for Linux OS checks.""" + + def test_prepare_linux_check_passes(self): + from azext_nvme_conversion._linux_checks import prepare_linux + compute_client = MagicMock() + output_value = MagicMock() + output_value.message 
= '[INFO] NVMe driver found\n[INFO] io_timeout set\n' + result = MagicMock() + result.value = [output_value] + poller = MagicMock() + poller.result.return_value = result + compute_client.virtual_machines.begin_run_command.return_value = poller + # Should not raise + prepare_linux(compute_client, 'rg', 'vm', fix_os=False, dry_run=False) + + def test_prepare_linux_check_fails_with_errors(self): + from azext_nvme_conversion._linux_checks import prepare_linux + from azure.cli.core.azclierror import ValidationError + compute_client = MagicMock() + output_value = MagicMock() + output_value.message = '[ERROR] NVMe driver not found in initrd/initramfs.\n' + result = MagicMock() + result.value = [output_value] + poller = MagicMock() + poller.result.return_value = result + compute_client.virtual_machines.begin_run_command.return_value = poller + with self.assertRaises(ValidationError): + prepare_linux(compute_client, 'rg', 'vm', fix_os=False, dry_run=False) + + def test_prepare_linux_fix_does_not_raise_on_errors(self): + from azext_nvme_conversion._linux_checks import prepare_linux + compute_client = MagicMock() + output_value = MagicMock() + output_value.message = '[ERROR] NVMe driver not found\n[INFO] Adding NVMe driver\n' + result = MagicMock() + result.value = [output_value] + poller = MagicMock() + poller.result.return_value = result + compute_client.virtual_machines.begin_run_command.return_value = poller + # With fix_os=True, should not raise even if errors are present + prepare_linux(compute_client, 'rg', 'vm', fix_os=True, dry_run=False) + + def test_prepare_linux_dryrun_does_not_raise_on_errors(self): + from azext_nvme_conversion._linux_checks import prepare_linux + compute_client = MagicMock() + output_value = MagicMock() + output_value.message = '[ERROR] fstab issue\n[DRYRUN] Staged fix\n' + result = MagicMock() + result.value = [output_value] + poller = MagicMock() + poller.result.return_value = result + 
compute_client.virtual_machines.begin_run_command.return_value = poller + # With dry_run=True, should not raise + prepare_linux(compute_client, 'rg', 'vm', fix_os=False, dry_run=True) + + def test_prepare_linux_passes_fix_flag(self): + from azext_nvme_conversion._linux_checks import prepare_linux + compute_client = MagicMock() + poller = MagicMock() + result = MagicMock() + result.value = [] + poller.result.return_value = result + compute_client.virtual_machines.begin_run_command.return_value = poller + prepare_linux(compute_client, 'rg', 'vm', fix_os=True, dry_run=False) + call_args = compute_client.virtual_machines.begin_run_command.call_args + run_input = call_args[0][2] + self.assertIsNotNone(run_input.parameters) + + +class ValidatorUnitTests(unittest.TestCase): + """Unit tests for parameter validators.""" + + def test_validate_vm_size_valid(self): + from azext_nvme_conversion._validators import validate_vm_size + ns = MagicMock() + ns.vm_size = 'Standard_E4bds_v5' + validate_vm_size(ns) + + def test_validate_vm_size_invalid(self): + from azext_nvme_conversion._validators import validate_vm_size + from azure.cli.core.azclierror import InvalidArgumentValueError + ns = MagicMock() + ns.vm_size = 'InvalidSize' + with self.assertRaises(InvalidArgumentValueError): + validate_vm_size(ns) + + def test_validate_vm_size_none_skips(self): + from azext_nvme_conversion._validators import validate_vm_size + ns = MagicMock() + ns.vm_size = None + validate_vm_size(ns) + + def test_validate_sleep_seconds_valid(self): + from azext_nvme_conversion._validators import validate_sleep_seconds + ns = MagicMock() + ns.sleep_seconds = 15 + validate_sleep_seconds(ns) + + def test_validate_sleep_seconds_negative(self): + from azext_nvme_conversion._validators import validate_sleep_seconds + from azure.cli.core.azclierror import InvalidArgumentValueError + ns = MagicMock() + ns.sleep_seconds = -1 + with self.assertRaises(InvalidArgumentValueError): + validate_sleep_seconds(ns) + + def 
test_validate_sleep_seconds_too_large(self): + from azext_nvme_conversion._validators import validate_sleep_seconds + from azure.cli.core.azclierror import InvalidArgumentValueError + ns = MagicMock() + ns.sleep_seconds = 9999 + with self.assertRaises(InvalidArgumentValueError): + validate_sleep_seconds(ns) + + +# --------------------------------------------------------------------------- +# Helpers for mocked end-to-end tests +# --------------------------------------------------------------------------- + +def _make_vm(os_type='Linux', controller='SCSI', size='Standard_E4bds_v5', + generation='V2', publisher=None, sku=None, zones=None): + """Create a MagicMock VM with realistic attributes.""" + vm = MagicMock() + vm.name = 'testvm' + vm.location = 'eastus' + vm.hardware_profile.vm_size = size + vm.storage_profile.os_disk.os_type = os_type + vm.storage_profile.os_disk.name = 'osdisk1' + vm.storage_profile.os_disk.managed_disk.id = ( + '/subscriptions/sub1/resourceGroups/rg1/providers/Microsoft.Compute/disks/osdisk1' + ) + vm.storage_profile.disk_controller_type = controller + vm.storage_profile.image_reference.publisher = publisher or ( + 'MicrosoftWindowsServer' if os_type == 'Windows' else 'Canonical' + ) + vm.storage_profile.image_reference.sku = sku or ( + '2022-datacenter-g2' if os_type == 'Windows' else '22_04-lts-gen2' + ) + vm.zones = zones + vm.security_profile = MagicMock() + return vm + + +def _make_compute_client(vm, generation='V2', controller_types='SCSI, NVMe'): + """Create a MagicMock compute client wired up for the given VM.""" + client = MagicMock() + client.virtual_machines.get.return_value = vm + + # Instance view — running + instance_view = MagicMock() + running_status = MagicMock() + running_status.code = 'PowerState/running' + instance_view.statuses = [running_status] + client.virtual_machines.instance_view.return_value = instance_view + + # Disk — generation + disk = MagicMock() + disk.hyper_v_generation = generation + 
client.disks.get.return_value = disk + + # Extension — not found (no ADE) + client.virtual_machine_extensions.get.side_effect = Exception('Not found') + + # SKU list + sku_cap_resource = MagicMock() + sku_cap_resource.name = 'MaxResourceVolumeMB' + sku_cap_resource.value = '150528' + sku = MagicMock() + sku.name = vm.hardware_profile.vm_size + sku.resource_type = 'virtualMachines' + if controller_types is not None: + sku_cap_controller = MagicMock() + sku_cap_controller.name = 'DiskControllerTypes' + sku_cap_controller.value = controller_types + sku.capabilities = [sku_cap_controller, sku_cap_resource] + else: + # Simulate SKUs with no DiskControllerTypes (older SCSI-only SKUs) + sku.capabilities = [sku_cap_resource] + sku.location_info = [] + client.resource_skus.list.return_value = [sku] + + # Pollers + for method in ['begin_deallocate', 'begin_start', 'begin_create_or_update', 'begin_run_command']: + poller = MagicMock() + poller.result.return_value = MagicMock(status_code='OK', status='Succeeded', value=[]) + getattr(client.virtual_machines, method).return_value = poller + + disk_poller = MagicMock() + disk_poller.result.return_value = MagicMock() + client.disks.begin_update.return_value = disk_poller + + # Deallocated status after stop + dealloc_view = MagicMock() + dealloc_status = MagicMock() + dealloc_status.code = 'PowerState/deallocated' + dealloc_view.statuses = [dealloc_status] + # By default, return deallocated (works for ignore_os_check=True where + # instance_view is only called post-stop). Tests that need the pre-check + # running state should override instance_view.side_effect. 
+ client.virtual_machines.instance_view.return_value = dealloc_view + + return client + + +class ResolveVmSizeTests(unittest.TestCase): + """Tests for _resolve_vm_size logic.""" + + def test_explicit_size_returned_as_is(self): + from azext_nvme_conversion.custom import _resolve_vm_size + vm = _make_vm() + client = _make_compute_client(vm) + result = _resolve_vm_size(client, vm, 'Standard_D4s_v5', 'NVMe', False) + self.assertEqual(result, 'Standard_D4s_v5') + + def test_none_size_uses_current_when_supported(self): + from azext_nvme_conversion.custom import _resolve_vm_size + vm = _make_vm(controller='SCSI', size='Standard_E4bds_v5') + client = _make_compute_client(vm, controller_types='SCSI, NVMe') + result = _resolve_vm_size(client, vm, None, 'NVMe', False) + self.assertEqual(result, 'Standard_E4bds_v5') + + def test_none_size_errors_when_not_supported(self): + from azext_nvme_conversion.custom import _resolve_vm_size + from azure.cli.core.azclierror import InvalidArgumentValueError + vm = _make_vm(controller='SCSI', size='Standard_D2s_v3') + client = _make_compute_client(vm, controller_types='SCSI') + with self.assertRaises(InvalidArgumentValueError): + _resolve_vm_size(client, vm, None, 'NVMe', False) + + def test_none_size_absent_capability_blocks_nvme(self): + """Missing DiskControllerTypes = SCSI-only → NVMe blocked.""" + from azext_nvme_conversion.custom import _resolve_vm_size + from azure.cli.core.azclierror import InvalidArgumentValueError + vm = _make_vm(controller='SCSI', size='Standard_D2s_v5') + client = _make_compute_client(vm, controller_types=None) + with self.assertRaises(InvalidArgumentValueError): + _resolve_vm_size(client, vm, None, 'NVMe', False) + + def test_none_size_absent_capability_allows_scsi(self): + """Missing DiskControllerTypes = SCSI-only → SCSI OK.""" + from azext_nvme_conversion.custom import _resolve_vm_size + vm = _make_vm(controller='NVMe', size='Standard_D2s_v5') + client = _make_compute_client(vm, controller_types=None) + 
result = _resolve_vm_size(client, vm, None, 'SCSI', False) + self.assertEqual(result, 'Standard_D2s_v5') + + def test_none_size_nvme_only_blocks_scsi(self): + """v6 NVMe-only SKU → SCSI blocked.""" + from azext_nvme_conversion.custom import _resolve_vm_size + from azure.cli.core.azclierror import InvalidArgumentValueError + vm = _make_vm(controller='NVMe', size='Standard_D2s_v6') + client = _make_compute_client(vm, controller_types='NVMe') + with self.assertRaises(InvalidArgumentValueError): + _resolve_vm_size(client, vm, None, 'SCSI', False) + + def test_none_size_with_ignore_sku_uses_current(self): + from azext_nvme_conversion.custom import _resolve_vm_size + vm = _make_vm(controller='SCSI', size='Standard_D2s_v3') + client = _make_compute_client(vm) + result = _resolve_vm_size(client, vm, None, 'NVMe', ignore_sku_check=True) + self.assertEqual(result, 'Standard_D2s_v3') + + +class ConvertEndToEndTests(unittest.TestCase): + """Mocked end-to-end tests for the convert command.""" + + @patch('azext_nvme_conversion._client_factory.cf_compute') + def _run_convert(self, mock_cf, vm=None, client=None, **kwargs): + """Helper to run nvme_conversion_convert with mocked client.""" + from azext_nvme_conversion.custom import nvme_conversion_convert + if vm is None: + vm = _make_vm() + if client is None: + client = _make_compute_client(vm) + mock_cf.return_value = client + cmd = MagicMock() + defaults = { + 'resource_group_name': 'rg1', + 'vm_name': 'testvm', + 'vm_size': None, + 'new_controller_type': None, + 'start_vm': False, + 'fix_os': False, + 'dry_run': False, + 'ignore_sku_check': True, + 'ignore_os_check': True, + 'ignore_windows_version_check': True, + 'sleep_seconds': 0, + 'no_wait': False, + 'yes': True, + } + defaults.update(kwargs) + return nvme_conversion_convert(cmd, **defaults), client + + def test_convert_scsi_to_nvme_succeeds(self): + vm = _make_vm(controller='SCSI') + result, client = self._run_convert(vm=vm) + self.assertEqual(result['status'], 
'succeeded') + self.assertEqual(result['controllerType'], 'NVMe') + # VM mock returns deallocated, so shutdown is skipped + client.virtual_machines.begin_deallocate.assert_not_called() + client.disks.begin_update.assert_called_once() + client.virtual_machines.begin_create_or_update.assert_called_once() + + def test_convert_nvme_to_scsi_succeeds(self): + vm = _make_vm(controller='NVMe') + result, client = self._run_convert(vm=vm) + self.assertEqual(result['status'], 'succeeded') + self.assertEqual(result['controllerType'], 'SCSI') + + def test_convert_already_on_target_returns_no_change(self): + vm = _make_vm(controller='NVMe') + result, _ = self._run_convert(vm=vm, new_controller_type='NVMe') + self.assertEqual(result['status'], 'no-change') + + def test_convert_with_start_vm(self): + vm = _make_vm(controller='SCSI') + result, client = self._run_convert(vm=vm, start_vm=True) + self.assertEqual(result['vmStarted'], True) + client.virtual_machines.begin_start.assert_called_once() + + def test_convert_without_start_vm(self): + vm = _make_vm(controller='SCSI') + result, client = self._run_convert(vm=vm, start_vm=False) + self.assertEqual(result['vmStarted'], False) + client.virtual_machines.begin_start.assert_not_called() + + def test_convert_no_wait_skips_start_wait(self): + vm = _make_vm(controller='SCSI') + result, client = self._run_convert(vm=vm, start_vm=True, no_wait=True) + self.assertEqual(result['status'], 'succeeded') + # begin_start is called but result() is not called on it + client.virtual_machines.begin_start.assert_called_once() + + def test_convert_includes_revert_command_for_nvme(self): + vm = _make_vm(controller='SCSI', size='Standard_E4bds_v5') + result, _ = self._run_convert(vm=vm) + self.assertIn('revertCommand', result) + self.assertIn('--controller-type SCSI', result['revertCommand']) + self.assertIn('Standard_E4bds_v5', result['revertCommand']) + + def test_convert_scsi_to_nvme_no_revert_for_scsi_target(self): + vm = _make_vm(controller='NVMe') 
class CheckEndToEndTests(unittest.TestCase):
    """End-to-end tests for the check command, run with the compute client factory patched."""

    @patch('azext_nvme_conversion._client_factory.cf_compute')
    def _run_check(self, mock_cf, vm=None, client=None, **kwargs):
        # Run nvme_conversion_check once and return its result.
        # ``mock_cf`` is injected by the @patch decorator; callers pass keywords only.
        from azext_nvme_conversion.custom import nvme_conversion_check
        vm = _make_vm() if vm is None else vm
        client = _make_compute_client(vm) if client is None else client
        mock_cf.return_value = client
        # Baseline arguments for the command; entries in ``kwargs`` override them.
        call_args = {
            'resource_group_name': 'rg1',
            'vm_name': 'testvm',
            'vm_size': None,
            'new_controller_type': None,
            'ignore_sku_check': True,
            'ignore_os_check': True,
            'ignore_windows_version_check': True,
            **kwargs,
        }
        return nvme_conversion_check(MagicMock(), **call_args)

    def test_check_scsi_vm_passes(self):
        # No target controller is passed; the report is expected to name NVMe.
        report = self._run_check(vm=_make_vm(controller='SCSI'))
        self.assertEqual(report['overallStatus'], 'passed')
        self.assertEqual(report['currentControllerType'], 'SCSI')
        self.assertEqual(report['targetControllerType'], 'NVMe')

    def test_check_already_on_target_returns_info(self):
        # Current and requested controller are both NVMe.
        nvme_vm = _make_vm(controller='NVMe')
        report = self._run_check(vm=nvme_vm, new_controller_type='NVMe')
        self.assertEqual(report['overallStatus'], 'passed')
        controller_check = report['checks']['controllerCheck']
        self.assertEqual(controller_check['status'], 'info')

    def test_check_gen1_fails(self):
        # The compute client is built with generation='V1'.
        gen1_vm = _make_vm(controller='SCSI')
        gen1_client = _make_compute_client(gen1_vm, generation='V1')
        report = self._run_check(vm=gen1_vm, client=gen1_client)
        generation_check = report['checks']['generationCheck']
        self.assertEqual(generation_check['status'], 'failed')
        self.assertEqual(report['overallStatus'], 'failed')

    def test_check_ade_fails(self):
        linux_vm = _make_vm(os_type='Linux', controller='SCSI')
        compute = _make_compute_client(linux_vm)
        # Clear any side effect on the extension lookup and have it return an
        # extension whose provisioning state is 'Succeeded'.
        get_extension = compute.virtual_machine_extensions.get
        get_extension.side_effect = None
        get_extension.return_value = MagicMock(provisioning_state='Succeeded')
        report = self._run_check(vm=linux_vm, client=compute)
        self.assertEqual(report['checks']['adeCheck']['status'], 'failed')

    def test_check_vm_not_found(self):
        from azure.cli.core.azclierror import ResourceNotFoundError
        # The VM lookup raises, so the check should report failure instead of raising.
        failing_client = MagicMock()
        failing_client.virtual_machines.get.side_effect = ResourceNotFoundError('Not found')
        report = self._run_check(client=failing_client)
        self.assertEqual(report['overallStatus'], 'failed')
        self.assertEqual(report['checks']['vmExists']['status'], 'failed')

    def test_check_reports_current_size(self):
        sized_vm = _make_vm(controller='SCSI', size='Standard_E8bds_v5')
        report = self._run_check(vm=sized_vm)
        self.assertEqual(report['currentSize'], 'Standard_E8bds_v5')
#!/usr/bin/env python

# --------------------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for license information.
# --------------------------------------------------------------------------------------------

"""Packaging script for the nvme-conversion Azure CLI extension."""

from pathlib import Path

from setuptools import setup, find_packages

VERSION = '1.0.0b1'

CLASSIFIERS = [
    'Development Status :: 4 - Beta',
    'Intended Audience :: Developers',
    'Intended Audience :: System Administrators',
    'Programming Language :: Python',
    'Programming Language :: Python :: 3',
    'Programming Language :: Python :: 3.10',
    'Programming Language :: Python :: 3.11',
    'Programming Language :: Python :: 3.12',
    'Programming Language :: Python :: 3.13',
    'License :: OSI Approved :: MIT License',
]

# NOTE(review): the development guide describes azure-mgmt-compute as bundled
# with the CLI core - confirm that pinning it here is intended and compatible
# with the version the CLI ships.
DEPENDENCIES = [
    'azure-mgmt-compute>=30.0.0',
]

# Resolve the description files next to this script so the build does not
# depend on the current working directory. The builtin text I/O is used
# instead of codecs.open, which shadowed open() and is deprecated in 3.14.
HERE = Path(__file__).resolve().parent
README = (HERE / 'README.md').read_text(encoding='utf-8')
HISTORY = (HERE / 'HISTORY.rst').read_text(encoding='utf-8')

setup(
    name='nvme-conversion',
    version=VERSION,
    description='Support for converting VM disk controllers between SCSI and NVMe.',
    long_description=README + '\n\n' + HISTORY,
    long_description_content_type='text/markdown',
    license='MIT',
    author='Microsoft Corporation',
    author_email='azpycli@microsoft.com',
    url='https://github.com/Azure/azure-cli-extensions/tree/main/src/nvme-conversion',
    classifiers=CLASSIFIERS,
    packages=find_packages(exclude=["tests", "tests.*", "azext_nvme_conversion.tests", "azext_nvme_conversion.tests.*"]),
    package_data={
        'azext_nvme_conversion': [
            'azext_metadata.json',
        ]
    },
    install_requires=DEPENDENCIES,
)