Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added CPU offload to enable full-length 720p on a 4090. #546

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -299,9 +299,9 @@ Then, you can choose the **resolution**, **duration**, and **aspect ratio** of t

| | Image | 2s | 4s | 8s | 16s |
| ---- | ------- | -------- | --------- | --------- | --------- |
| 360p | 3s, 24G | 18s, 27G | 31s, 27G | 62s, 28G | 121s, 33G |
| 480p | 2s, 24G | 29s, 31G | 55s, 30G | 108s, 32G | 219s, 36G |
| 720p | 6s, 27G | 68s, 41G | 130s, 39G | 260s, 45G | 547s, 67G |
| 360p | 3s, --G | 18s, --G | 31s, --G | 62s, --G | 121s, --G |
| 480p | 2s, --G | 29s, --G | 55s, --G | 108s, --G | 219s, 19G |
| 720p | 6s, --G | 68s, --G | 130s, --G | 260s, --G | 547s, --G |

Note that besides text to video, you can also use **image to video generation**. You can upload an image and then click the "**Generate video**" button to generate a video with the image as the first frame. Or you can fill in the text prompt and click the "**Generate image**" button to generate an image with the text prompt, and then click the "**Generate video**" button to generate a video with the image generated with the same model.

Expand Down
11 changes: 7 additions & 4 deletions gradio/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,11 +89,11 @@ def build_models(model_type, config, enable_optimization=False):
# build vae
from opensora.registry import MODELS, build_module

vae = build_module(config.vae, MODELS).cuda()
vae = build_module(config.vae, MODELS)#.cuda()

# build text encoder
text_encoder = build_module(config.text_encoder, MODELS) # T5 must be fp32
text_encoder.t5.model = text_encoder.t5.model.cuda()
# text_encoder.t5.model = text_encoder.t5.model.cuda()

# build stdit
# we load model from HuggingFace directly so that we don't need to
Expand All @@ -102,7 +102,8 @@ def build_models(model_type, config, enable_optimization=False):

model_kwargs = {k: v for k, v in config.model.items() if k not in ("type", "from_pretrained", "force_huggingface")}
stdit = STDiT3.from_pretrained(HF_STDIT_MAP[model_type], **model_kwargs)
stdit = stdit.cuda()
# stdit = torch.compile(stdit, mode="reduce-overhead")
# stdit = stdit.cuda()

# build scheduler
from opensora.registry import SCHEDULERS
Expand Down Expand Up @@ -195,7 +196,7 @@ def parse_args():
vae, text_encoder, stdit, scheduler = build_models(
args.model_type, config, enable_optimization=args.enable_optimization
)

print(scheduler)

def run_inference(
mode,
Expand Down Expand Up @@ -363,8 +364,10 @@ def run_inference(
progress=True,
mask=masks,
)
vae.cuda()
samples = vae.decode(samples.to(dtype), num_frames=num_frames)
video_clips.append(samples)
vae.cpu()

# =========================
# Save output
Expand Down
8 changes: 6 additions & 2 deletions opensora/schedulers/rf/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,11 +49,14 @@ def sample(

n = len(prompts)
# text encoding
text_encoder.t5.model = text_encoder.t5.model.cuda()

model_args = text_encoder.encode(prompts)
y_null = text_encoder.null(n)
y_null = text_encoder.null(n).cuda()
model_args["y"] = torch.cat([model_args["y"], y_null], 0)
if additional_args is not None:
model_args.update(additional_args)
text_encoder.t5.model = text_encoder.t5.model.cpu()

# prepare timesteps
timesteps = [(1.0 - i / self.num_sampling_steps) * self.num_timesteps for i in range(self.num_sampling_steps)]
Expand All @@ -66,7 +69,7 @@ def sample(
if mask is not None:
noise_added = torch.zeros_like(mask, dtype=torch.bool)
noise_added = noise_added | (mask == 1)

model = model.cuda()
progress_wrap = tqdm if progress else (lambda x: x)
for i, t in progress_wrap(enumerate(timesteps)):
# mask for adding noise
Expand Down Expand Up @@ -96,6 +99,7 @@ def sample(

if mask is not None:
z = torch.where(mask_t_upper[:, None, :, None, None], z, x0)
model = model.cpu()

return z

Expand Down