Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 34 additions & 11 deletions inference/convert_demo.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,36 @@

Run the script for **image-to-video**:
$ python convert_demo.py --prompt "the cat is running" --type "i2v" --image_path "/path/to/your/image.jpg"

### Using MiniMax as the LLM provider:
$ MINIMAX_API_KEY=your_minimax_api_key python convert_demo.py --prompt "A girl riding a bike." --type "t2v"

### Using OpenAI or any OpenAI-compatible provider:
$ OPENAI_API_KEY=your_openai_api_key OPENAI_BASE_URL=https://api.openai.com/v1 python convert_demo.py --prompt "A girl riding a bike." --type "t2v"
"""

import argparse
import os
from openai import OpenAI, AzureOpenAI
import base64
from mimetypes import guess_type


def _get_llm_client():
    """
    Build the LLM client used for prompt enhancement and pick its model.

    Selection order:
        1. If MINIMAX_API_KEY is set to a non-empty value, an OpenAI-compatible
           client is pointed at MiniMax. The endpoint comes from
           MINIMAX_BASE_URL and falls back to https://api.minimax.io/v1.
        2. Otherwise a default ``OpenAI()`` client is created with no explicit
           arguments, so the SDK resolves its own configuration (per this
           module's usage notes: OPENAI_API_KEY / OPENAI_BASE_URL).

    Returns:
        A ``(client, model_name)`` tuple: ``"MiniMax-M2.7"`` for MiniMax,
        ``"glm-4-plus"`` for the default client.
    """
    api_key = os.environ.get("MINIMAX_API_KEY")
    if not api_key:
        # No MiniMax key: defer entirely to the SDK's default configuration.
        return OpenAI(), "glm-4-plus"

    endpoint = os.environ.get("MINIMAX_BASE_URL", "https://api.minimax.io/v1")
    return OpenAI(api_key=api_key, base_url=endpoint), "MiniMax-M2.7"

sys_prompt_t2v = """You are part of a team of bots that creates videos. You work with an assistant bot that will draw anything you say in square brackets.

For example , outputting " a beautiful morning in the woods with the sun peaking through the trees " will trigger your partner bot to output an video of a forest morning , as described. You will be prompted by people looking to create detailed , amazing videos. The way to accomplish this is to take their short prompts and make them extremely detailed and descriptive.
Expand Down Expand Up @@ -61,16 +84,16 @@ def image_to_url(image_path):

def convert_prompt(prompt: str, retry_times: int = 3, type: str = "t2v", image_path: str = None):
"""
Convert a prompt to a format that can be used by the model for inference
"""
Convert a prompt to a format that can be used by the model for inference.

client = OpenAI()
## If you using with Azure OpenAI, please uncomment the below line and comment the above line
# client = AzureOpenAI(
# api_key="",
# api_version="",
# azure_endpoint=""
# )
LLM provider is selected automatically from environment variables:
- MINIMAX_API_KEY → MiniMax (MiniMax-M2.7)
- OPENAI_API_KEY → OpenAI or any OpenAI-compatible provider
"""
client, default_model = _get_llm_client()
## To use Azure OpenAI instead, replace the line above with:
# client = AzureOpenAI(api_key="", api_version="", azure_endpoint="")
# default_model = "gpt-4o"

text = prompt.strip()
for i in range(retry_times):
Expand Down Expand Up @@ -107,15 +130,15 @@ def convert_prompt(prompt: str, retry_times: int = 3, type: str = "t2v", image_p
"content": f'Create an imaginative video descriptive caption or modify an earlier caption in ENGLISH for the user input: " {text} "',
},
],
model="glm-4-plus", # glm-4-plus and gpt-4o have be tested
model=default_model,
temperature=0.01,
top_p=0.7,
stream=False,
max_tokens=250,
)
else:
response = client.chat.completions.create(
model="gpt-4o",
model=default_model,
messages=[
{"role": "system", "content": f"{sys_prompt_i2v}"},
{
Expand Down
33 changes: 27 additions & 6 deletions inference/gradio_composite_demo/app.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,14 @@
"""
This is the main file for the gradio web demo. It uses the CogVideoX-5B model to generate videos.
set environment variable OPENAI_API_KEY to use the OpenAI API to enhance the prompt.

Usage:
OPENAI_API_KEY=your_openai_api_key OPENAI_BASE_URL=https://api.openai.com/v1 python inference/gradio_composite_demo/app.py
The optional prompt enhancement feature uses an LLM to refine your prompt before video generation.
Set one of the following environment variables to enable it:

- MINIMAX_API_KEY: Use MiniMax (MiniMax-M2.7) as the LLM provider.
MINIMAX_API_KEY=your_minimax_api_key python inference/gradio_composite_demo/app.py

- OPENAI_API_KEY: Use OpenAI or any OpenAI-compatible provider.
OPENAI_API_KEY=your_openai_api_key OPENAI_BASE_URL=https://api.openai.com/v1 python inference/gradio_composite_demo/app.py
"""

import math
Expand Down Expand Up @@ -170,10 +175,26 @@ def center_crop_resize(input_video_path, target_width=720, target_height=480):
return temp_video_path


def _get_llm_client():
    """
    Build the LLM client used for prompt enhancement and pick its model.

    Selection order:
        1. If MINIMAX_API_KEY is set to a non-empty value, an OpenAI-compatible
           client is pointed at MiniMax. The endpoint comes from
           MINIMAX_BASE_URL and falls back to https://api.minimax.io/v1.
        2. Otherwise a default ``OpenAI()`` client is created with no explicit
           arguments, so the SDK resolves its own configuration (per this
           module's usage notes: OPENAI_API_KEY / OPENAI_BASE_URL).

    Returns:
        A ``(client, model_name)`` tuple: ``"MiniMax-M2.7"`` for MiniMax,
        ``"glm-4-plus"`` for the default client.
    """
    api_key = os.environ.get("MINIMAX_API_KEY")
    if not api_key:
        # No MiniMax key: defer entirely to the SDK's default configuration.
        return OpenAI(), "glm-4-plus"

    endpoint = os.environ.get("MINIMAX_BASE_URL", "https://api.minimax.io/v1")
    return OpenAI(api_key=api_key, base_url=endpoint), "MiniMax-M2.7"


def convert_prompt(prompt: str, retry_times: int = 3) -> str:
if not os.environ.get("OPENAI_API_KEY"):
if not os.environ.get("OPENAI_API_KEY") and not os.environ.get("MINIMAX_API_KEY"):
return prompt
client = OpenAI()
client, model = _get_llm_client()
text = prompt.strip()

for i in range(retry_times):
Expand Down Expand Up @@ -209,7 +230,7 @@ def convert_prompt(prompt: str, retry_times: int = 3) -> str:
"content": f'Create an imaginative video descriptive caption or modify an earlier caption in ENGLISH for the user input: "{text}"',
},
],
model="glm-4-plus",
model=model,
temperature=0.01,
top_p=0.7,
stream=False,
Expand Down
35 changes: 28 additions & 7 deletions inference/gradio_web_demo.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,18 @@
"""
This is the main file for the gradio web demo. It uses the CogVideoX-2B model to generate videos.
set environment variable OPENAI_API_KEY to use the OpenAI API to enhance the prompt.

The optional prompt enhancement feature uses an LLM to refine your prompt before video generation.
Set one of the following environment variables to enable it:

- MINIMAX_API_KEY: Use MiniMax (MiniMax-M2.7) as the LLM provider.
MINIMAX_API_KEY=your_minimax_api_key python inference/gradio_web_demo.py

- OPENAI_API_KEY: Use OpenAI or any OpenAI-compatible provider.
OPENAI_API_KEY=your_openai_api_key OPENAI_BASE_URL=https://api.openai.com/v1 python inference/gradio_web_demo.py

This demo only supports the text-to-video generation model.
If you wish to use the image-to-video or video-to-video generation models,
please use the gradio_composite_demo to implement the full GUI functionality.

Usage:
OPENAI_API_KEY=your_openai_api_key OPENAI_BASE_URL=https://api.openai.com/v1 python inference/gradio_web_demo.py
"""

import os
Expand Down Expand Up @@ -46,11 +51,27 @@
"""


def _get_llm_client():
    """
    Build the LLM client used for prompt enhancement and pick its model.

    Selection order:
        1. If MINIMAX_API_KEY is set to a non-empty value, an OpenAI-compatible
           client is pointed at MiniMax. The endpoint comes from
           MINIMAX_BASE_URL and falls back to https://api.minimax.io/v1.
        2. Otherwise a default ``OpenAI()`` client is created with no explicit
           arguments, so the SDK resolves its own configuration (per this
           module's usage notes: OPENAI_API_KEY / OPENAI_BASE_URL).

    Returns:
        A ``(client, model_name)`` tuple: ``"MiniMax-M2.7"`` for MiniMax,
        ``"glm-4-plus"`` for the default client.
    """
    api_key = os.environ.get("MINIMAX_API_KEY")
    if not api_key:
        # No MiniMax key: defer entirely to the SDK's default configuration.
        return OpenAI(), "glm-4-plus"

    endpoint = os.environ.get("MINIMAX_BASE_URL", "https://api.minimax.io/v1")
    return OpenAI(api_key=api_key, base_url=endpoint), "MiniMax-M2.7"


def convert_prompt(prompt: str, retry_times: int = 3) -> str:
if not os.environ.get("OPENAI_API_KEY"):
if not os.environ.get("OPENAI_API_KEY") and not os.environ.get("MINIMAX_API_KEY"):
return prompt

client = OpenAI()
client, model = _get_llm_client()
text = prompt.strip()

for i in range(retry_times):
Expand Down Expand Up @@ -86,7 +107,7 @@ def convert_prompt(prompt: str, retry_times: int = 3) -> str:
"content": f'Create an imaginative video descriptive caption or modify an earlier caption in ENGLISH for the user input: "{text}"',
},
],
model="glm-4-plus",
model=model,
temperature=0.01,
top_p=0.7,
stream=False,
Expand Down
Loading