RafaelGodoyEbert · kroll-j · Mar 23, 2026 · Mar 23, 2026 · Mar 23, 2026 · Mar 31, 2026
diff --git a/main_improved.py b/main_improved.py
@@ -138,6 +138,7 @@ def main():
     parser.add_argument("--video-quality", choices=["best", "1080p", "720p", "480p"], default="best", help="Video download quality")
     parser.add_argument("--skip-youtube-subs", action="store_true", help="Skip downloading YouTube subtitles")
     parser.add_argument("--translate-target", help="Target language code for subtitle translation (e.g. 'pt', 'en').")
+    parser.add_argument("--llama-args", type=json.loads, help="override any llama_cpp.llama.Llama() constructor args, json, e.g.: '{\"n_gpu_layers\": 40, \"n_ctx\": 8192, \"kv_overrides\": { \"kv_unified\": true }, \"flash_attn\": true }' -- see https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.__init__", default='{"n_gpu_layers": -1, "n_ctx": 8192, "verbose": false }')  # type=json.loads: malformed JSON is rejected at parse time; the string default is decoded too
 
     args = parser.parse_args()
 
@@ -477,7 +478,8 @@ def main():
                         api_key=api_key,
                         project_folder=project_folder,
                         chunk_size_arg=args.chunk_size,
-                        model_name_arg=args.ai_model_name
+                        model_name_arg=args.ai_model_name,
+                        llama_args=args.llama_args  # already JSON-decoded by argparse (type=json.loads)
                     )
 
                 if not viral_segments or not viral_segments.get("segments"):

diff --git a/scripts/create_viral_segments.py b/scripts/create_viral_segments.py
@@ -498,7 +498,7 @@ def process_segments(raw_segments, transcript_segments, min_duration, max_durati
     return final_result
 
 
-def create(num_segments, viral_mode, themes, tempo_minimo, tempo_maximo, ai_mode="manual", api_key=None, project_folder="tmp", chunk_size_arg=None, model_name_arg=None):
+def create(num_segments, viral_mode, themes, tempo_minimo, tempo_maximo, ai_mode="manual", api_key=None, project_folder="tmp", chunk_size_arg=None, model_name_arg=None, llama_args=None):
     quantidade_de_virals = num_segments
 
     # 1. Load Transcript
@@ -687,12 +687,13 @@ def create(num_segments, viral_mode, themes, tempo_minimo, tempo_maximo, ai_mode
 
         print(f"[INFO] Loading Local Model: {os.path.basename(model_path)} (This may take a while)...")
         try:
-            local_llm_instance = Llama(
-                model_path=model_path,
-                n_gpu_layers=-1, 
-                n_ctx=8192,
-                verbose=False
-            )
+            # Built-in defaults first; keys in llama_args (None means "no overrides") win.
+            llama_kwargs = {
+                "model_path": model_path, "n_gpu_layers": -1,
+                "n_ctx": 8192, "verbose": False,
+                **(llama_args or {}),
+            }
+            local_llm_instance = Llama(**llama_kwargs)
         except Exception as e:
             print(f"Failed to load model: {e}")
             return {"segments": []}
@@ -788,4 +789,4 @@ def create(num_segments, viral_mode, themes, tempo_minimo, tempo_maximo, ai_mode
         tempo_minimo, 
         tempo_maximo, 
         output_count=quantidade_de_virals
-    )
+    )