Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ logs/
.idea
output*
test*
!tests/
!tests/**
venv
**/.swp
**/*.log
Expand Down
74 changes: 63 additions & 11 deletions inference/cli_demo.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,24 @@
To run the script, use the following command with appropriate arguments:

```bash
# Single GPU (default behavior, uses CPU offload):
$ python cli_demo.py --prompt "A girl riding a bike." --model_path THUDM/CogVideoX1.5-5b --generate_type "t2v"

# Multi-GPU with balanced device mapping:
$ python cli_demo.py --prompt "A girl riding a bike." --model_path THUDM/CogVideoX1.5-5b --generate_type "t2v" --device_map balanced

# Multi-GPU with auto device mapping:
$ python cli_demo.py --prompt "A girl riding a bike." --model_path THUDM/CogVideoX1.5-5b --generate_type "t2v" --device_map auto
```

You can change `pipe.enable_sequential_cpu_offload()` to `pipe.enable_model_cpu_offload()` to speed up inference, but this will use more GPU memory.

Multi-GPU Support:
- Use `--device_map balanced` to distribute the model evenly across available GPUs (recommended for inference)
- Use `--device_map auto` for automatic device placement by accelerate
- Use `--device_map sequential` to fill GPUs sequentially (useful for uneven memory)
- Default behavior (no --device_map) uses CPU offload for single-GPU setups

Additional options are available to specify the model path, guidance scale, number of inference steps, video generation type, and output paths.

"""
Expand Down Expand Up @@ -48,6 +61,9 @@
"cogvideox-2b": (480, 720),
}

# Accepted values for the `device_map` argument of `generate_video` (multi-GPU support).
# `None` is also accepted by `generate_video` and means "no device map" (CPU offload path).
# Stored as a frozenset so this module-level constant cannot be mutated by accident;
# keep it in sync with the `--device_map` choices of the command-line parser.
VALID_DEVICE_MAPS = frozenset({"auto", "balanced", "sequential"})


def generate_video(
prompt: str,
Expand All @@ -66,6 +82,7 @@ def generate_video(
generate_type: str = Literal["t2v", "i2v", "v2v"], # i2v: image to video, v2v: video to video
seed: int = 42,
fps: int = 16,
device_map: Optional[str] = None,
):
"""
Generates a video based on the given prompt and saves it to the specified path.
Expand All @@ -86,11 +103,23 @@ def generate_video(
- generate_type (str): The type of video generation (e.g., 't2v', 'i2v', 'v2v').
- seed (int): The seed for reproducibility.
- fps (int): The frames per second for the generated video.
- device_map (Optional[str]): Device placement strategy for multi-GPU support. Options:
- None (default): Uses sequential CPU offload for single-GPU setups
- "balanced": Distributes model layers evenly across available GPUs (recommended)
- "auto": Automatic device placement by accelerate library
- "sequential": Fills GPUs one by one in order

Multi-GPU Usage Examples:
# Balanced distribution across GPUs (recommended):
generate_video(prompt="...", model_path="...", device_map="balanced")

# Automatic device placement:
generate_video(prompt="...", model_path="...", device_map="auto")
"""

# 1. Load the pre-trained CogVideoX pipeline with the specified precision (bfloat16).
# add device_map="balanced" in the from_pretrained function and remove the enable_model_cpu_offload()
# function to use Multi GPUs.
# When device_map is specified, the model is distributed across multiple GPUs.
# When device_map is None (default), CPU offload is enabled for single-GPU setups.

image = None
video = None
Expand All @@ -115,13 +144,25 @@ def generate_video(
)
height, width = desired_resolution

# Validate device_map if provided
if device_map is not None and device_map not in VALID_DEVICE_MAPS:
raise ValueError(
f"Invalid device_map '{device_map}'. Must be one of: {', '.join(sorted(VALID_DEVICE_MAPS))} or None"
)

# Build kwargs for from_pretrained - add device_map only when specified
load_kwargs = {"torch_dtype": dtype}
if device_map is not None:
load_kwargs["device_map"] = device_map
logging.info(f"\033[1mUsing device_map='{device_map}' for multi-GPU inference\033[0m")

if generate_type == "i2v":
pipe = CogVideoXImageToVideoPipeline.from_pretrained(model_path, torch_dtype=dtype)
pipe = CogVideoXImageToVideoPipeline.from_pretrained(model_path, **load_kwargs)
image = load_image(image=image_or_video_path)
elif generate_type == "t2v":
pipe = CogVideoXPipeline.from_pretrained(model_path, torch_dtype=dtype)
pipe = CogVideoXPipeline.from_pretrained(model_path, **load_kwargs)
else:
pipe = CogVideoXVideoToVideoPipeline.from_pretrained(model_path, torch_dtype=dtype)
pipe = CogVideoXVideoToVideoPipeline.from_pretrained(model_path, **load_kwargs)
video = load_video(image_or_video_path)

# If you're using with lora, add this code
Expand All @@ -141,13 +182,16 @@ def generate_video(
pipe.scheduler.config, timestep_spacing="trailing"
)

# 3. Enable CPU offload for the model.
# turn off if you have multiple GPUs or enough GPU memory(such as H100) and it will cost less time in inference
# and enable to("cuda")
# pipe.to("cuda")
# 3. Enable CPU offload for the model (only when not using multi-GPU device_map).
# When device_map is specified, the model is already distributed across GPUs,
# so CPU offload is not needed and would conflict with device placement.
if device_map is None:
# Single-GPU mode: use CPU offload to manage memory
# Turn off if you have multiple GPUs or enough GPU memory (such as H100)
# pipe.enable_model_cpu_offload()
pipe.enable_sequential_cpu_offload()

# pipe.enable_model_cpu_offload()
pipe.enable_sequential_cpu_offload()
# VAE optimizations work in both single and multi-GPU modes
pipe.vae.enable_slicing()
pipe.vae.enable_tiling()

Expand Down Expand Up @@ -248,6 +292,13 @@ def generate_video(
"--dtype", type=str, default="bfloat16", help="The data type for computation"
)
parser.add_argument("--seed", type=int, default=42, help="The seed for reproducibility")
parser.add_argument(
"--device_map",
type=str,
default=None,
choices=["auto", "balanced", "sequential"],
help="Device placement strategy for multi-GPU inference. Options: 'balanced' (recommended, distributes evenly), 'auto' (automatic placement), 'sequential' (fills GPUs in order). Default: None (uses CPU offload for single-GPU)",
)

args = parser.parse_args()
dtype = torch.float16 if args.dtype == "float16" else torch.bfloat16
Expand All @@ -268,4 +319,5 @@ def generate_video(
generate_type=args.generate_type,
seed=args.seed,
fps=args.fps,
device_map=args.device_map,
)
1 change: 1 addition & 0 deletions tests/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Tests for CogVideoX CLI tools
Loading