zai-org · octo-patch · Apr 6, 2026
diff --git a/inference/convert_demo.py b/inference/convert_demo.py
@@ -14,13 +14,36 @@
 
 Run the script for **image-to-video**:
     $ python convert_demo.py --prompt "the cat is running" --type "i2v" --image_path "/path/to/your/image.jpg"
+
+### Using MiniMax as the LLM provider:
+    $ MINIMAX_API_KEY=your_minimax_api_key python convert_demo.py --prompt "A girl riding a bike." --type "t2v"
+
+### Using OpenAI or any OpenAI-compatible provider (the request model defaults to glm-4-plus, so the endpoint must serve that model):
+    $ OPENAI_API_KEY=your_openai_api_key OPENAI_BASE_URL=https://api.openai.com/v1 python convert_demo.py --prompt "A girl riding a bike." --type "t2v"
 """
 
 import argparse
+import os
 from openai import OpenAI, AzureOpenAI
 import base64
 from mimetypes import guess_type
 
+
+def _get_llm_client():
+    """
+    Return (client, model_name) based on environment variables; empty variables count as unset.
+
+    Priority:
+      1. MINIMAX_API_KEY set → MiniMax via MINIMAX_BASE_URL (default https://api.minimax.io/v1); model is MINIMAX_MODEL or "MiniMax-M2.7"
+      2. otherwise → OpenAI(), which the SDK configures from OPENAI_API_KEY / OPENAI_BASE_URL; model is OPENAI_MODEL or "glm-4-plus"
+    """
+    minimax_api_key = os.environ.get("MINIMAX_API_KEY")
+    if minimax_api_key:
+        # `or` instead of a .get() default, so an empty variable still falls back to the default value.
+        base_url = os.environ.get("MINIMAX_BASE_URL") or "https://api.minimax.io/v1"
+        return OpenAI(api_key=minimax_api_key, base_url=base_url), os.environ.get("MINIMAX_MODEL") or "MiniMax-M2.7"
+    return OpenAI(), os.environ.get("OPENAI_MODEL") or "glm-4-plus"
+
 sys_prompt_t2v = """You are part of a team of bots that creates videos. You work with an assistant bot that will draw anything you say in square brackets.
 
 For example , outputting " a beautiful morning in the woods with the sun peaking through the trees " will trigger your partner bot to output an video of a forest morning , as described. You will be prompted by people looking to create detailed , amazing videos. The way to accomplish this is to take their short prompts and make them extremely detailed and descriptive.
@@ -61,16 +84,16 @@ def image_to_url(image_path):
 
 def convert_prompt(prompt: str, retry_times: int = 3, type: str = "t2v", image_path: str = None):
     """
-    Convert a prompt to a format that can be used by the model for inference
-    """
+    Convert a prompt to a format that can be used by the model for inference.
 
-    client = OpenAI()
-    ## If you using with Azure OpenAI, please uncomment the below line and comment the above line
-    # client = AzureOpenAI(
-    #     api_key="",
-    #     api_version="",
-    #     azure_endpoint=""
-    # )
+    LLM provider is selected automatically from environment variables:
+      - MINIMAX_API_KEY → MiniMax (MiniMax-M2.7)
+      - OPENAI_API_KEY  → OpenAI or any OpenAI-compatible provider
+    """
+    client, default_model = _get_llm_client()
+    ## To use Azure OpenAI instead, replace the line above with:
+    # client = AzureOpenAI(api_key="", api_version="", azure_endpoint="")
+    # default_model = "gpt-4o"
 
     text = prompt.strip()
     for i in range(retry_times):
@@ -107,15 +130,15 @@ def convert_prompt(prompt: str, retry_times: int = 3, type: str = "t2v", image_p
                         "content": f'Create an imaginative video descriptive caption or modify an earlier caption in ENGLISH for the user input: " {text} "',
                     },
                 ],
-                model="glm-4-plus",  # glm-4-plus and gpt-4o have be tested
+                model=default_model,
                 temperature=0.01,
                 top_p=0.7,
                 stream=False,
                 max_tokens=250,
             )
         else:
             response = client.chat.completions.create(
-                model="gpt-4o",
+                model=default_model,
                 messages=[
                     {"role": "system", "content": f"{sys_prompt_i2v}"},
                     {

diff --git a/inference/gradio_composite_demo/app.py b/inference/gradio_composite_demo/app.py
@@ -1,9 +1,14 @@
 """
 THis is the main file for the gradio web demo. It uses the CogVideoX-5B model to generate videos gradio web demo.
-set environment variable OPENAI_API_KEY to use the OpenAI API to enhance the prompt.
 
-Usage:
-    OpenAI_API_KEY=your_openai_api_key OPENAI_BASE_URL=https://api.openai.com/v1 python inference/gradio_web_demo.py
+The optional prompt enhancement feature uses an LLM to refine your prompt before video generation.
+Set one of the following environment variables to enable it:
+
+  - MINIMAX_API_KEY: Use MiniMax (MiniMax-M2.7) as the LLM provider.
+      MINIMAX_API_KEY=your_minimax_api_key python inference/gradio_composite_demo/app.py
+
+  - OPENAI_API_KEY: Use OpenAI or any OpenAI-compatible provider (the request model defaults to glm-4-plus, so the endpoint must serve that model).
+      OPENAI_API_KEY=your_openai_api_key OPENAI_BASE_URL=https://api.openai.com/v1 python inference/gradio_composite_demo/app.py
 """
 
 import math
@@ -170,10 +175,26 @@ def center_crop_resize(input_video_path, target_width=720, target_height=480):
     return temp_video_path
 
 
+def _get_llm_client():
+    """
+    Return (client, model_name) based on environment variables; empty variables count as unset.
+
+    Priority:
+      1. MINIMAX_API_KEY set → MiniMax via MINIMAX_BASE_URL (default https://api.minimax.io/v1); model is MINIMAX_MODEL or "MiniMax-M2.7"
+      2. otherwise → OpenAI(), which the SDK configures from OPENAI_API_KEY / OPENAI_BASE_URL; model is OPENAI_MODEL or "glm-4-plus"
+    """
+    minimax_api_key = os.environ.get("MINIMAX_API_KEY")
+    if minimax_api_key:
+        # `or` instead of a .get() default, so an empty variable still falls back to the default value.
+        base_url = os.environ.get("MINIMAX_BASE_URL") or "https://api.minimax.io/v1"
+        return OpenAI(api_key=minimax_api_key, base_url=base_url), os.environ.get("MINIMAX_MODEL") or "MiniMax-M2.7"
+    return OpenAI(), os.environ.get("OPENAI_MODEL") or "glm-4-plus"
+
+
 def convert_prompt(prompt: str, retry_times: int = 3) -> str:
-    if not os.environ.get("OPENAI_API_KEY"):
+    if not os.environ.get("OPENAI_API_KEY") and not os.environ.get("MINIMAX_API_KEY"):
         return prompt
-    client = OpenAI()
+    client, model = _get_llm_client()
     text = prompt.strip()
 
     for i in range(retry_times):
@@ -209,7 +230,7 @@ def convert_prompt(prompt: str, retry_times: int = 3) -> str:
                     "content": f'Create an imaginative video descriptive caption or modify an earlier caption in ENGLISH for the user input: "{text}"',
                 },
             ],
-            model="glm-4-plus",
+            model=model,
             temperature=0.01,
             top_p=0.7,
             stream=False,

diff --git a/inference/gradio_web_demo.py b/inference/gradio_web_demo.py
@@ -1,13 +1,18 @@
 """
 THis is the main file for the gradio web demo. It uses the CogVideoX-2B model to generate videos gradio web demo.
-set environment variable OPENAI_API_KEY to use the OpenAI API to enhance the prompt.
+
+The optional prompt enhancement feature uses an LLM to refine your prompt before video generation.
+Set one of the following environment variables to enable it:
+
+  - MINIMAX_API_KEY: Use MiniMax (MiniMax-M2.7) as the LLM provider.
+      MINIMAX_API_KEY=your_minimax_api_key python inference/gradio_web_demo.py
+
+  - OPENAI_API_KEY: Use OpenAI or any OpenAI-compatible provider (the request model defaults to glm-4-plus, so the endpoint must serve that model).
+      OPENAI_API_KEY=your_openai_api_key OPENAI_BASE_URL=https://api.openai.com/v1 python inference/gradio_web_demo.py
 
 This demo only supports the text-to-video generation model.
 If you wish to use the image-to-video or video-to-video generation models,
 please use the gradio_composite_demo to implement the full GUI functionality.
-
-Usage:
-    OpenAI_API_KEY=your_openai_api_key OpenAI_BASE_URL=https://api.openai.com/v1 python inference/gradio_web_demo.py
 """
 
 import os
@@ -46,11 +51,27 @@
 """
 
 
+def _get_llm_client():
+    """
+    Return (client, model_name) based on environment variables; empty variables count as unset.
+
+    Priority:
+      1. MINIMAX_API_KEY set → MiniMax via MINIMAX_BASE_URL (default https://api.minimax.io/v1); model is MINIMAX_MODEL or "MiniMax-M2.7"
+      2. otherwise → OpenAI(), which the SDK configures from OPENAI_API_KEY / OPENAI_BASE_URL; model is OPENAI_MODEL or "glm-4-plus"
+    """
+    minimax_api_key = os.environ.get("MINIMAX_API_KEY")
+    if minimax_api_key:
+        # `or` instead of a .get() default, so an empty variable still falls back to the default value.
+        base_url = os.environ.get("MINIMAX_BASE_URL") or "https://api.minimax.io/v1"
+        return OpenAI(api_key=minimax_api_key, base_url=base_url), os.environ.get("MINIMAX_MODEL") or "MiniMax-M2.7"
+    return OpenAI(), os.environ.get("OPENAI_MODEL") or "glm-4-plus"
+
+
 def convert_prompt(prompt: str, retry_times: int = 3) -> str:
-    if not os.environ.get("OPENAI_API_KEY"):
+    if not os.environ.get("OPENAI_API_KEY") and not os.environ.get("MINIMAX_API_KEY"):
         return prompt
 
-    client = OpenAI()
+    client, model = _get_llm_client()
     text = prompt.strip()
 
     for i in range(retry_times):
@@ -86,7 +107,7 @@ def convert_prompt(prompt: str, retry_times: int = 3) -> str:
                     "content": f'Create an imaginative video descriptive caption or modify an earlier caption in ENGLISH for the user input: "{text}"',
                 },
             ],
-            model="glm-4-plus",
+            model=model,
             temperature=0.01,
             top_p=0.7,
             stream=False,