modelscope-text-to-video-sy.../app.py

125 lines
4.1 KiB
Python

#!/usr/bin/env python
from __future__ import annotations
import os
import random
import tempfile
import gradio as gr
import imageio
import numpy as np
import torch
from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler
from gradio.themes.utils import sizes
# Start from Gradio's default theme with radius_size set to sizes.radius_none,
# then override the label/title/primary-button colours: #4D63FF text and
# border on a white button that turns #EDEFFF on hover.
theme = gr.themes.Default(radius_size=sizes.radius_none)
theme = theme.set(
    block_label_text_color='#4D63FF',
    block_title_text_color='#4D63FF',
    button_primary_text_color='#4D63FF',
    button_primary_border_color='#4D63FF',
    button_primary_background_fill='#FFFFFF',
    button_primary_background_fill_hover='#EDEFFF',
)

# Extra stylesheet passed to gr.Blocks: hides the page's <footer> element.
css = 'footer {visibility: hidden}'
# Upper bound of the "Number of frames" slider; taken from the MAX_NUM_FRAMES
# environment variable, falling back to 200 when it is unset.
MAX_NUM_FRAMES = int(os.environ.get('MAX_NUM_FRAMES', '200'))

# Default frame count from the DEFAULT_NUM_FRAMES environment variable
# (fallback 16), capped so it can never exceed MAX_NUM_FRAMES.
DEFAULT_NUM_FRAMES = min(
    int(os.environ.get('DEFAULT_NUM_FRAMES', '16')),
    MAX_NUM_FRAMES,
)
# Load the 'damo-vilab/text-to-video-ms-1.7b' pipeline, requesting its fp16
# weight variant with float16 tensors.
pipe = DiffusionPipeline.from_pretrained(
    'damo-vilab/text-to-video-ms-1.7b',
    variant='fp16',
    torch_dtype=torch.float16,
)

# Replace the pipeline's scheduler with a DPMSolverMultistepScheduler built
# from the configuration of the scheduler it shipped with.
pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)

# Switch on model CPU offload and VAE slicing.
# NOTE(review): presumably both are here to lower peak GPU memory — confirm
# against the diffusers documentation for the installed version.
pipe.enable_model_cpu_offload()
pipe.enable_vae_slicing()
def to_video(frames: list[np.ndarray], fps: int) -> str:
    """Encode a sequence of frames into a temporary MP4 file.

    Args:
        frames: Frames to write, in playback order. Each one is passed
            unchanged to the imageio writer's ``append_data``.
        fps: Frame rate handed to the FFMPEG writer.

    Returns:
        Path of the ``.mp4`` file that was written. The file is created with
        ``delete=False``, so it is not removed automatically and the caller
        is responsible for deleting it.

    Raises:
        Whatever the imageio writer raises while opening the file or
        appending a frame; the writer is still closed in that case.
    """
    # Only the reserved path is needed. Close our own handle immediately so
    # the file descriptor is not leaked and the writer can reopen the path.
    with tempfile.NamedTemporaryFile(suffix='.mp4', delete=False) as out_file:
        video_path = out_file.name

    writer = imageio.get_writer(video_path, format='FFMPEG', fps=fps)
    try:
        for frame in frames:
            writer.append_data(frame)
    finally:
        # Always release the writer, even if a frame fails to encode.
        writer.close()
    return video_path
def generate(prompt: str, seed: int, num_frames: int,
             num_inference_steps: int) -> str:
    """Run the text-to-video pipeline for ``prompt`` and return a video path.

    Args:
        prompt: Text passed to the pipeline as its first argument.
        seed: Seed for the torch generator. The value -1 is replaced by a
            random integer drawn from [0, 1000000].
        num_frames: Forwarded to the pipeline as ``num_frames``.
        num_inference_steps: Forwarded to the pipeline as
            ``num_inference_steps``.

    Returns:
        The path returned by ``to_video`` after writing the frames at 8 fps.
    """
    effective_seed = random.randint(0, 1000000) if seed == -1 else seed

    rng = torch.Generator()
    rng.manual_seed(effective_seed)

    output = pipe(
        prompt,
        generator=rng,
        num_frames=num_frames,
        num_inference_steps=num_inference_steps,
    )
    return to_video(output.frames, 8)
# Preset rows for the examples widget. Column order follows the `inputs` list
# used by the UI: [prompt, seed, num_frames, num_inference_steps].
examples = [
    [text, 0, 16, 25]
    for text in (
        'An astronaut riding a horse.',
        'A panda eating bamboo on a rock.',
        'Spiderman is surfing.',
    )
]
# Build the page: a prompt row, the result video, an advanced-options panel,
# clickable examples, and the event wiring that calls `generate`.
# NOTE(review): the source had lost its indentation, so the nesting below is
# reconstructed — confirm it against the deployed layout.
with gr.Blocks(theme=theme, css=css) as demo:
    # Centred page heading (the text reads "Synthesize video from text").
    gr.Markdown("""
<div align='center' ><font size='60'>通过文本合成视频</font></div>
""")
    with gr.Group():
        with gr.Box():
            # Prompt textbox and the "generate video" button share one row.
            with gr.Row(elem_id='prompt-container').style(equal_height=True):
                prompt = gr.Text(
                    label='Prompt',
                    show_label=False,
                    max_lines=1,
                    placeholder='输入提示',
                    elem_id='prompt-text-input').style(container=False)
                run_button = gr.Button('生成视频').style(
                    full_width=False)
        result = gr.Video(label='Result', show_label=False, elem_id='gallery')
        # "Advanced options" panel, collapsed on load (open=False).
        with gr.Accordion('高级选项', open=False):
            seed = gr.Slider(
                label='Seed',
                minimum=-1,
                maximum=1000000,
                step=1,
                value=-1,
                info='If set to -1, a different seed will be used each time.')
            num_frames = gr.Slider(
                label='Number of frames',
                minimum=16,
                maximum=MAX_NUM_FRAMES,
                step=1,
                # Honour the environment-configurable default instead of a
                # hard-coded 16, but never start below the slider's minimum.
                value=max(16, DEFAULT_NUM_FRAMES),
                info=
                'Note that the content of the video also changes when you change the number of frames.'
            )
            num_inference_steps = gr.Slider(label='Number of inference steps',
                                            minimum=10,
                                            maximum=50,
                                            step=1,
                                            value=25)

    # Component order must match the positional parameters of `generate`.
    inputs = [
        prompt,
        seed,
        num_frames,
        num_inference_steps,
    ]
    # Example rows; results are cached only when SYSTEM == 'spaces'.
    gr.Examples(examples=examples,
                inputs=inputs,
                outputs=result,
                fn=generate,
                cache_examples=os.getenv('SYSTEM') == 'spaces',
                label="示例")

    # Submitting the textbox and clicking the button trigger the same call.
    prompt.submit(fn=generate, inputs=inputs, outputs=result)
    run_button.click(fn=generate, inputs=inputs, outputs=result)
# Enable the request queue (max_size=15, api_open=False), then start the
# server bound to 0.0.0.0 so it listens on all network interfaces.
(
    demo
    .queue(max_size=15, api_open=False)
    .launch(server_name="0.0.0.0")
)