[PP] Fix PP meta init #582

Open · wants to merge 1 commit into base: gh/wconstab/39/base
17 changes: 9 additions & 8 deletions torchtitan/parallelisms/pipeline_llama.py
@@ -104,9 +104,11 @@ def _build_stage(stage_idx, start_layer, stop_layer, is_first=False, is_last=Fal
model.norm = None
model.output = None

# TODO(whc) once ManualPipelineStage supports lazy shape inference, we can leave model on meta device longer and
# get rid of the input shape hardcoded here. For now, it should not be a big deal since we only materialize the
# layers of the model that map to this stage, not the whole model.
# Note: these tensors are only here as metadata hints, so the pipelining runtime knows what size buffer to allocate.
# These tensors should be on meta device, and the model should be too. It will be allocated on device after
# applying all other parallelisms.

# TODO(whc) once ManualPipelineStage supports lazy shape inference, we can avoid specifying input/output shapes
mp_dtype = _mixed_precision_dtype(job_config, parallel_dims)
batch_size = job_config.training.batch_size
local_seq_len = int(job_config.training.seq_len // parallel_dims.tp)
@@ -117,18 +119,17 @@ def _build_stage(stage_idx, start_layer, stop_layer, is_first=False, is_last=Fal
model_config.vocab_size,
)
if is_first:
(input,) = _llama_trace_input(job_config, model_config, device=device)
(input,) = _llama_trace_input(job_config, model_config, device="meta")
else:
# later layers (assume all start w/ a transformer layer)
input = torch.rand(layers_io_shape, dtype=mp_dtype, device=device)
input = torch.rand(layers_io_shape, dtype=mp_dtype, device="meta")

if is_last:
output = torch.rand(output_layer_shape, dtype=torch.float32, device=device)
output = torch.rand(output_layer_shape, dtype=torch.float32, device="meta")
else:
# earlier layers (assume all end in a transformer layer)
output = torch.rand(layers_io_shape, dtype=mp_dtype, device=device)
output = torch.rand(layers_io_shape, dtype=mp_dtype, device="meta")

model.to_empty(device=device)
stage = PipelineStage(
Contributor:

just to understand: the device arg for PipelineStage still needs to be the actual device, e.g. cuda, correct?

Contributor Author:

Correct. And I want to remove that too in PipelineStage but I didn't do it yet.

model,
stage_idx,
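For context, here is a minimal sketch (not from the PR) of the meta-device init pattern the diff and review thread describe, using only core torch APIs. The module, sizes, and variable names below are hypothetical stand-ins for the stage submodule and job_config values; as confirmed in the review thread, the real code still passes the actual device to PipelineStage while the shape-hint tensors stay on meta.

import torch
import torch.nn as nn

# Hypothetical sizes standing in for job_config.training values.
batch_size, seq_len, dim = 8, 2048, 4096
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Build the stage's submodule on the meta device: parameters carry only
# shape/dtype metadata, so no real memory is allocated yet.
with torch.device("meta"):
    stage_module = nn.Linear(dim, dim, bias=False)

# Shape-hint tensors also live on meta; the pipelining runtime only needs
# their sizes and dtypes to know what buffers to allocate later.
example_input = torch.rand(batch_size, seq_len, dim, device="meta")
example_output = torch.rand(batch_size, seq_len, dim, device="meta")

# After all other parallelisms are applied, materialize storage on the real
# device; to_empty() allocates uninitialized memory, so parameter init must
# still run afterwards.
stage_module.to_empty(device=device)

# The stage wrapper (PipelineStage in the PR) still receives the real device,
# while the example input/output tensors remain on meta.
print(example_input.device, stage_module.weight.device)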