100%|█████████████████████████████████████████████████████████████████████████████████| 300/300 [00:01<00:00, 288.29it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 300/300 [00:16<00:00, 17.91it/s]
  0%|                                                                                             | 0/10 [00:02<?, ?it/s]
2024-03-19 16:17:44.901 Uncaught app exception
Traceback (most recent call last):
  File "/home/attect/anaconda3/envs/DiffSynthStudio/lib/python3.9/site-packages/streamlit/runtime/scriptrunner/", line 542, in _run_script
    exec(code, module.__dict__)
  File "/mnt/e/DiffSynth-Studio/examples/", line 94, in <module>
  File "/mnt/e/DiffSynth-Studio/diffsynth/pipelines/", line 349, in run
    output_video = self.synthesize_video(model_manager, pipe, config["pipeline"]["seed"], smoother, **config["pipeline"]["pipeline_inputs"])
  File "/mnt/e/DiffSynth-Studio/diffsynth/pipelines/", line 299, in synthesize_video
    output_video = pipe(**pipeline_inputs, smoother=smoother)
  File "/home/attect/anaconda3/envs/DiffSynthStudio/lib/python3.9/site-packages/torch/utils/", line 115, in decorate_context
    return func(*args, **kwargs)
  File "/mnt/e/DiffSynth-Studio/diffsynth/pipelines/", line 221, in __call__
    noise_pred_posi = lets_dance_with_long_video(
  File "/mnt/e/DiffSynth-Studio/diffsynth/pipelines/", line 38, in lets_dance_with_long_video
    hidden_states_batch = lets_dance(
  File "/mnt/e/DiffSynth-Studio/diffsynth/pipelines/", line 72, in lets_dance
    hidden_states, time_emb, text_emb, res_stack = block(hidden_states, time_emb, text_emb, res_stack)
  File "/home/attect/anaconda3/envs/DiffSynthStudio/lib/python3.9/site-packages/torch/nn/modules/", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/attect/anaconda3/envs/DiffSynthStudio/lib/python3.9/site-packages/torch/nn/modules/", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
  File "/mnt/e/DiffSynth-Studio/diffsynth/models/", line 222, in forward
    hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
RuntimeError: Sizes of tensors must match except in dimension 1. Expected size 24 but got size 23 for tensor number 1 in the list.


100%|██████████████████████████████████████████████████████████████████████████████████| 900/900 [00:09<00:00, 92.59it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 900/900 [01:50<00:00,  8.18it/s]


Traceback (most recent call last):
File "D:\PyCharm 2023.2.1\project\DiffSynth-Studio\examples\", line 94, in
File "D:\PyCharm 2023.2.1\project\DiffSynth-Studio\diffsynth\pipelines\", line 340, in run
model_manager, pipe = self.load_pipeline(**config["models"])
File "D:\PyCharm 2023.2.1\project\DiffSynth-Studio\diffsynth\pipelines\", line 271, in load_pipeline
File "D:\PyCharm 2023.2.1\project\DiffSynth-Studio\diffsynth\", line 177, in load_textual_inversions
for file_name in os.listdir(folder):
FileNotFoundError: [WinError 3] 系统找不到指定的路径。: 'models/textual_inversion'


IndexError: 150
File "D:\DiffSynth-Studio.glut\lib\site-packages\streamlit\runtime\scriptrunner\", line 542, in _run_script
exec(code, module.dict)
File "D:\DiffSynth-Studio\pages\", line 197, in
File "D:\DiffSynth-Studio\diffsynth\pipelines\", line 337, in run
config["pipeline"]["pipeline_inputs"] = self.add_data_to_pipeline_inputs(config["data"], config["pipeline"]["pipeline_inputs"])
File "D:\DiffSynth-Studio\diffsynth\pipelines\", line 315, in add_data_to_pipeline_inputs
pipeline_inputs["input_frames"] = self.load_video(**data["input_frames"])
File "D:\DiffSynth-Studio\diffsynth\pipelines\", line 310, in load_video
frames = [video[i] for i in range(start_frame_id, end_frame_id)]
File "D:\DiffSynth-Studio\diffsynth\pipelines\", line 310, in
frames = [video[i] for i in range(start_frame_id, end_frame_id)]
File "D:\DiffSynth-Studio\diffsynth\data\", line 121, in getitem
frame =
File "D:\DiffSynth-Studio\diffsynth\data\", line 15, in getitem
return Image.fromarray(np.array(self.reader.get_data(item))).convert("RGB")
File "D:\DiffSynth-Studio.glut\lib\site-packages\imageio\core\", line 437, in get_data
raise IndexError(index)

PermissionError: [WinError 10013] 以一种访问权限不允许的方式做了一个访问套接字的尝试。

(DiffSynthStudio) D:\下载\DiffSynth-Studio-main>python -m streamlit run
Traceback (most recent call last):
File "D:\Anaconda\envs\DiffSynthStudio\lib\", line 197, in _run_module_as_main
return _run_code(code, main_globals, None,
File "D:\Anaconda\envs\DiffSynthStudio\lib\", line 87, in run_code
exec(code, run_globals)
File "D:\Anaconda\envs\DiffSynthStudio\lib\site-packages\streamlit_main
.py", line 20, in
File "D:\Anaconda\envs\DiffSynthStudio\lib\site-packages\click\", line 1157, in call
return self.main(*args, **kwargs)
File "D:\Anaconda\envs\DiffSynthStudio\lib\site-packages\click\", line 1078, in main
rv = self.invoke(ctx)
File "D:\Anaconda\envs\DiffSynthStudio\lib\site-packages\click\", line 1688, in invoke
return _process_result(sub_ctx.command.invoke(sub_ctx))
File "D:\Anaconda\envs\DiffSynthStudio\lib\site-packages\click\", line 1434, in invoke
return ctx.invoke(self.callback, **ctx.params)
File "D:\Anaconda\envs\DiffSynthStudio\lib\site-packages\click\", line 783, in invoke
return __callback(*args, **kwargs)
File "D:\Anaconda\envs\DiffSynthStudio\lib\site-packages\streamlit\web\", line 233, in main_run
_main_run(target, args, flag_options=kwargs)
File "D:\Anaconda\envs\DiffSynthStudio\lib\site-packages\streamlit\web\", line 269, in _main_run, is_hello, args, flag_options)
File "D:\Anaconda\envs\DiffSynthStudio\lib\site-packages\streamlit\web\", line 430, in run
File "D:\Anaconda\envs\DiffSynthStudio\lib\asyncio\", line 44, in run
return loop.run_until_complete(main)
File "D:\Anaconda\envs\DiffSynthStudio\lib\asyncio\", line 647, in run_until_complete
return future.result()
File "D:\Anaconda\envs\DiffSynthStudio\lib\site-packages\streamlit\web\", line 418, in run_server
await server.start()
File "D:\Anaconda\envs\DiffSynthStudio\lib\site-packages\streamlit\web\server\", line 262, in start
File "D:\Anaconda\envs\DiffSynthStudio\lib\site-packages\streamlit\web\server\", line 129, in start_listening
File "D:\Anaconda\envs\DiffSynthStudio\lib\site-packages\streamlit\web\server\", line 188, in start_listening_tcp_socket
http_server.listen(port, address)
File "D:\Anaconda\envs\DiffSynthStudio\lib\site-packages\tornado\", line 183, in listen
sockets = bind_sockets(
File "D:\Anaconda\envs\DiffSynthStudio\lib\site-packages\tornado\", line 162, in bind_sockets
PermissionError: [WinError 10013] 以一种访问权限不允许的方式做了一个访问套接字的尝试。


Traceback (most recent call last):
File "D:\PyCharm 2023.2.1\project\DiffSynth-Studio\examples\", line 94, in
File "D:\PyCharm 2023.2.1\project\DiffSynth-Studio\diffsynth\pipelines\", line 340, in run
model_manager, pipe = self.load_pipeline(**config["models"])
File "D:\PyCharm 2023.2.1\project\DiffSynth-Studio\diffsynth\pipelines\", line 271, in load_pipeline
File "D:\PyCharm 2023.2.1\project\DiffSynth-Studio\diffsynth\", line 177, in load_textual_inversions
for file_name in os.listdir(folder):
FileNotFoundError: [WinError 3] 系统找不到指定的路径。: 'models/textual_inversion'

Black pictures

Hello, the pictures I produce are always black. I guess it is related to half precision. How can I adjust the parameters?

Repo is difficult to set up. Would appreciate an easy, working Colab notebook

Hello. I tried installing this locally, it took around an hour, and it did not work. I tried installing it on Colab, that took another hour, and it did not work. It would be nice to have a notebook attached to this repo so we can just click "run all" and try it out. Thank you for making this repo though.

Error when using the LCM_lora_1.5 LoRA

当我设置 Lora 为 LCM 1.5 Lora时候 ,发现这个错误:

  1. 我发现是 在转换 downsamplers时候 ,down 对应的 权重是 (320,64,3,3,),后面两个维度不是能够压缩的1 1 ,我该如何修改呢,
    lora_unet_down_blocks_0_downsamplers_0_conv.lora_down.weight:torch.Size([64, 320, 3, 3])
    lora_unet_down_blocks_0_downsamplers_0_conv.lora_up.weight:torch.Size([320, 64, 1, 1])
  2. 关于 diffusers 中 LCM Lora 实现,封装的太隐蔽,没有找到 ,请问是和项目一样的实现吗,

RuntimeError: output with shape [77, 12, 1, 1] doesn't match the broadcast shape [77, 12, 77, 77]

It's hard to make the file diffsynth/models/ run.
Please help. Compared to ComfyUI, I really like your code.
Thanks for your hard work.

output_paths = sample(
File "/code/./", line 148, in sample
output_video = pipe(
File "/opt/conda/lib/python3.10/site-packages/torch/autograd/", line 27, in decorate_context
return func(*args, **kwargs)
File "/code/./diffsynth/pipelines/", line 195, in call
prompt_emb_posi = self.prompter.encode_prompt(self.text_encoder, prompt, clip_skip=clip_skip, device=self.device, positive=True).cpu()
File "/code/./diffsynth/prompts/", line 84, in encode_prompt
prompt_emb = text_encoder(input_ids, clip_skip=clip_skip)
File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/", line 1194, in _call_impl
return forward_call(*input, **kwargs)
File "/code/./diffsynth/models/", line 68, in forward
embeds = encoder(embeds, attn_mask=attn_mask)
File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/", line 1194, in _call_impl
return forward_call(*input, **kwargs)
File "/code/./diffsynth/models/", line 23, in forward
hidden_states = self.attn(hidden_states, attn_mask=attn_mask)
File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/", line 1194, in _call_impl
return forward_call(*input, **kwargs)
File "/code/./diffsynth/models/", line 76, in forward
return self.torch_forward(hidden_states[0], encoder_hidden_states=encoder_hidden_states, attn_mask=attn_mask)
File "/code/./diffsynth/models/", line 43, in torch_forward
hidden_states = torch.nn.functional._scaled_dot_product_attention(q, k, v, attn_mask=attn_mask)
RuntimeError: output with shape [77, 12, 1, 1] doesn't match the broadcast shape [77, 12, 77, 77]

Please help: why do I get IndexError: 2400?

input_video = [video[i] for i in range(40*60, 41*60)]

File "/code/./diffsynth/data/", line 121, in getitem
frame =
File "/code/./diffsynth/data/", line 15, in getitem
return Image.fromarray(np.array(self.reader.get_data(item))).convert("RGB")
File "/opt/conda/lib/python3.10/site-packages/imageio/core/", line 437, in get_data
raise IndexError(index)
IndexError: 2400

PyTorch在编译时没有启用Flash Attention优化


D:\AIGC\DiffSynth-Studio-main\diffsynth\models\ UserWarning: 1Torch was not compiled with flash attention. (Triggered internally at C:\cb\pytorch_1000000000000\work\aten\src\ATen\native\transformers\cuda\sdp_utils.cpp:263.)
hidden_states = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask)


Cannot run the example

An error occurred when running the example:

(/root/autodl-tmp/DiffSynth-Studio/venv/DiffSynthStudio) root@autodl-container-28ee458c74-84d890ed:~/autodl-tmp/DiffSynth-Studio/examples# python
Traceback (most recent call last):
File "/root/autodl-tmp/DiffSynth-Studio/examples/", line 1, in
from diffsynth import ModelManager, SDVideoPipeline, ControlNetConfigUnit, VideoData, save_video
ModuleNotFoundError: No module named 'diffsynth'

大佬求助,上周用还好的,今天突然IndexError: 140

File "C:\Users\DL\anaconda3\Lib\site-packages\streamlit\runtime\scriptrunner\", line 584, in _run_script
exec(code, module.dict)
File "F:\CYQ\AI\DiffSynth-Studio\pages\", line 197, in
File "F:\CYQ\AI\DiffSynth-Studio\diffsynth\pipelines\", line 337, in run
config["pipeline"]["pipeline_inputs"] = self.add_data_to_pipeline_inputs(config["data"], config["pipeline"]["pipeline_inputs"])
File "F:\CYQ\AI\DiffSynth-Studio\diffsynth\pipelines\", line 315, in add_data_to_pipeline_inputs
pipeline_inputs["input_frames"] = self.load_video(**data["input_frames"])
File "F:\CYQ\AI\DiffSynth-Studio\diffsynth\pipelines\", line 310, in load_video
frames = [video[i] for i in range(start_frame_id, end_frame_id)]
File "F:\CYQ\AI\DiffSynth-Studio\diffsynth\pipelines\", line 310, in
frames = [video[i] for i in range(start_frame_id, end_frame_id)]
File "F:\CYQ\AI\DiffSynth-Studio\diffsynth\data\", line 121, in getitem
frame =
File "F:\CYQ\AI\DiffSynth-Studio\diffsynth\data\", line 15, in getitem
return Image.fromarray(np.array(self.reader.get_data(item))).convert("RGB")
File "C:\Users\DL\anaconda3\Lib\site-packages\imageio\core\", line 437, in get_data
raise IndexError(index)

Video Creator error in Text2Video

TypeError: diffsynth.pipelines.stable_diffusion_video.SDVideoPipelineRunner.load_video() argument after ** must be a mapping, not NoneType
File "F:\DiffSynth-Studio-main\DiffSynthStudio\lib\site-packages\streamlit\runtime\scriptrunner\", line 542, in _run_script
exec(code, module.dict)
File "F:\DiffSynth-Studio-main\pages\", line 197, in
File "F:\DiffSynth-Studio-main\diffsynth\pipelines\", line 337, in run
config["pipeline"]["pipeline_inputs"] = self.add_data_to_pipeline_inputs(config["data"], config["pipeline"]["pipeline_inputs"])
File "F:\DiffSynth-Studio-main\diffsynth\pipelines\", line 315, in add_data_to_pipeline_inputs
pipeline_inputs["input_frames"] = self.load_video(**data["input_frames"])

generate video error, model weight error?

Traceback (most recent call last):
File "/root/miniconda3/envs/DiffSynthStudio/lib/python3.9/site-packages/streamlit/runtime/scriptrunner/", line 584, in _run_script
exec(code, module.dict)
File "/root/autodl-tmp/DiffSynth-Studio/pages/", line 197, in
File "/root/autodl-tmp/DiffSynth-Studio/diffsynth/pipelines/", line 340, in run
model_manager, pipe = self.load_pipeline(**config["models"])
File "/root/autodl-tmp/DiffSynth-Studio/diffsynth/pipelines/", line 271, in load_pipeline
File "/root/autodl-tmp/DiffSynth-Studio/diffsynth/models/", line 181, in load_textual_inversions
state_dict = load_state_dict(os.path.join(folder, file_name))
File "/root/autodl-tmp/DiffSynth-Studio/diffsynth/models/", line 244, in load_state_dict
return load_state_dict_from_bin(file_path, torch_dtype=torch_dtype)
File "/root/autodl-tmp/DiffSynth-Studio/diffsynth/models/", line 258, in load_state_dict_from_bin
state_dict = torch.load(file_path, map_location="cpu")
File "/root/miniconda3/envs/DiffSynthStudio/lib/python3.9/site-packages/torch/", line 1040, in load
return _legacy_load(opened_file, map_location, pickle_module, **pickle_load_args)
File "/root/miniconda3/envs/DiffSynthStudio/lib/python3.9/site-packages/torch/", line 1258, in _legacy_load
magic_number = pickle_module.load(f, **pickle_load_args)
_pickle.UnpicklingError: invalid load key, '-'.



RuntimeError: Sizes of tensors must match except in dimension 1. Expected size 16 but got size 15 for tensor number 1 in the list.

Hi! :)
I'm really interested in the new Difftoon pipeline, but whatever input video I use I get this error

File "/home/wizard/repositories/DiffSynth-Studio/diffsynth/models/", line 222, in forward
    hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
RuntimeError: Sizes of tensors must match except in dimension 1. Expected size 16 but got size 15 for tensor number 1 in the list.

I set up the environment as indicated in the README and it worked flawlessly. I have no idea what I should look for to fix this: I haven't changed anything in the settings except the input video path and its resolution.

Thank you for the help!!

Flash attention question

Hi, Great work!!
I have one question: in the paper, you said that "we adopt flash attention [6] in all attention layers, including the text encoder, UNet, VAE, ControlNet models, and motion modules". I found the xformers_forward() function in the Attention module. However, this function is never called during the whole process of "". This is strange, since it can still generate high-resolution videos. Could you explain how this works?


torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 256.00 MiB (GPU 0; 8.00 GiB total capacity; 7.31 GiB already allocated; 0 bytes free; 7.99 GiB allowed; 7.78 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF


显卡:3070 laptop 8G

Image Creator
Model type:sd_xl_turbo_1.0_fp16.safetensors

Drawing tool:freedraw
Stroke width:50
Denoising strength:0.7

Generate image:
enable auto update

ERROR: Sizes of tensors must match except in dimension 1. Expected size 136 but got size 135 for tensor number 1 in the list --which occurs at

!!! Exception during processing !!!
Traceback (most recent call last):
File "/data/comfy-ui/", line 151, in recursive_execute
output_data, output_ui = get_output_data(obj, input_data_all)
File "/data/comfy-ui/", line 81, in get_output_data
return_values = map_node_over_list(obj, input_data_all, obj.FUNCTION, allow_interrupt=True)
File "/data/comfy-ui/", line 74, in map_node_over_list
results.append(getattr(obj, func)(**slice_dict(input_data_all, i)))
File "/data/comfy-ui/custom_nodes/comfyui-cartoon-stylization/", line 71, in stylize
DiffSynthService().stylize(video_file_path, width, height, frames, fps, output_dir, TARGET_FPS, prompt, neg_prompt,stage1_infer_steps,stage2_infer_steps)
File "/data/comfy-ui/custom_nodes/comfyui-cartoon-stylization/", line 181, in stylize
File "/data/comfy-ui/custom_nodes/comfyui-cartoon-stylization/diffsynth/pipelines/", line 358, in run
output_video = self.synthesize_video(model_manager, pipe, config["pipeline"]["seed"], smoother, **config["pipeline"]["pipeline_inputs"])
File "/data/comfy-ui/custom_nodes/comfyui-cartoon-stylization/diffsynth/pipelines/", line 304, in synthesize_video
output_video = pipe(**pipeline_inputs, smoother=smoother)
File "/data/comfy-ui/venv/lib/python3.10/site-packages/torch/utils/", line 115, in decorate_context
return func(*args, **kwargs)
File "/data/comfy-ui/custom_nodes/comfyui-cartoon-stylization/diffsynth/pipelines/", line 226, in call
noise_pred_posi = lets_dance_with_long_video(
File "/data/comfy-ui/custom_nodes/comfyui-cartoon-stylization/diffsynth/pipelines/", line 43, in lets_dance_with_long_video
hidden_states_batch = lets_dance(
File "/data/comfy-ui/custom_nodes/comfyui-cartoon-stylization/diffsynth/pipelines/", line 72, in lets_dance
hidden_states, time_emb, text_emb, res_stack = block(hidden_states, time_emb, text_emb, res_stack)
File "/data/comfy-ui/venv/lib/python3.10/site-packages/torch/nn/modules/", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/data/comfy-ui/venv/lib/python3.10/site-packages/torch/nn/modules/", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/data/comfy-ui/custom_nodes/comfyui-cartoon-stylization/diffsynth/models/", line 222, in forward
hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
RuntimeError: Sizes of tensors must match except in dimension 1. Expected size 136 but got size 135 for tensor number 1 in the list.



Explain sd_text_to_video numframes and fps

numframes default is 64
fps default is 120

Shouldn't this make a video lasting approx half a second? But it makes a 4 second video?

If I wanted a 10 second video at 30 fps, what would I set the numframes and fps to?

