
Commit

pre-commit
ohwi committed Sep 30, 2022
1 parent 8d80aa7, commit 8ab949d
Showing 6 changed files with 68 additions and 31 deletions.
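All six files below change formatting only, consistent with running the black formatter through pre-commit: long imports are wrapped in parentheses with trailing commas, single quotes become double quotes, 0. becomes 0.0, and inline comments get two leading spaces. For orientation only, a minimal pre-commit configuration that would drive such a pass is sketched here; the repository's actual .pre-commit-config.yaml is not part of this commit, and the pinned rev is an assumed example.

    repos:
      - repo: https://github.com/psf/black   # official black pre-commit hook
        rev: 22.8.0                           # assumed example pin, not taken from this repository
        hooks:
          - id: black

Typical usage (illustrative): "pre-commit install" registers the git hook, and "pre-commit run --all-files" reformats the whole tree in one pass, which is the kind of sweep a commit like this usually reflects.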
oslo/torch/nn/parallel/pipeline_parallel/_buffers.py (6 changes: 4 additions & 2 deletions)
@@ -1,4 +1,6 @@
from oslo.torch.nn.parallel.pipeline_parallel._sync import register_location_for_forward_counter
from oslo.torch.nn.parallel.pipeline_parallel._sync import (
register_location_for_forward_counter,
)


# original forward dictionary
@@ -31,4 +33,4 @@ def save_activation(key, activation):


def pop_activation(key):
return _ACTIVATIONS.pop(key, []) # TODO; okay?
return _ACTIVATIONS.pop(key, []) # TODO; okay?
oslo/torch/nn/parallel/pipeline_parallel/_functional.py (16 changes: 12 additions & 4 deletions)
@@ -11,12 +11,18 @@
register_job_requires_backward,
notify_backward_job_done,
)
from oslo.torch.nn.parallel.pipeline_parallel._messages import pack_tensor_stub, unpack_tensor_stub
from oslo.torch.nn.parallel.pipeline_parallel._messages import (
pack_tensor_stub,
unpack_tensor_stub,
)


def remote_module_forward(
caller, location, unique_key,
args_stub, kwargs_stub,
caller,
location,
unique_key,
args_stub,
kwargs_stub,
requires_redirection,
is_training,
is_grad_enabled,
@@ -37,7 +43,9 @@ def remote_module_forward(
result = forward_fn(*args, **kwargs)

result_stub, tensors = pack_tensor_stub(result, [])
need_activation_save = any([t.requires_grad for t in tensors]) and is_training and is_grad_enabled
need_activation_save = (
any([t.requires_grad for t in tensors]) and is_training and is_grad_enabled
)
if need_activation_save:
save_activation(unique_key, tensors)

oslo/torch/nn/parallel/pipeline_parallel/_messages.py (14 changes: 9 additions & 5 deletions)
@@ -3,7 +3,9 @@
import torch

from oslo.torch.nn.parallel.pipeline_parallel._utils import (
_is_namedtuple, _is_private, _is_primitive
_is_namedtuple,
_is_private,
_is_primitive,
)


@@ -30,7 +32,7 @@ def pack_tensor_stub(obj, args_list):
for i in range(len(obj_list)):
obj_list_i, args_list = pack_tensor_stub(obj_list[i], args_list)
obj_list_i[i] = obj_list_i
obj = obj.__class__._make(obj_list) # use namedtuple's method
obj = obj.__class__._make(obj_list) # use namedtuple's method

return obj, args_list

@@ -60,9 +62,10 @@ def pack_tensor_stub(obj, args_list):
elif _is_primitive(obj):
return obj, args_list

else: # other kinds of object
else: # other kinds of object
members = [
attr for attr in dir(obj)
attr
for attr in dir(obj)
if not callable(getattr(obj, attr)) and not _is_private(attr)
]
for m in members:
@@ -120,7 +123,8 @@ def unpack_tensor_stub(obj, args_list):

else: # other kinds of object
members = [
attr for attr in dir(obj)
attr
for attr in dir(obj)
if not callable(getattr(obj, attr)) and not _is_private(attr)
]
for m in members:
oslo/torch/nn/parallel/pipeline_parallel/_utils.py (4 changes: 2 additions & 2 deletions)
@@ -36,8 +36,8 @@ def _is_namedtuple(obj):


def _is_primitive(obj):
return not hasattr(obj, '__dict__')
return not hasattr(obj, "__dict__")


def _is_private(attr):
return attr.startswith('__')
return attr.startswith("__")
oslo/torch/nn/parallel/pipeline_parallel/pipeline_parallel.py (32 changes: 22 additions & 10 deletions)
@@ -14,15 +14,21 @@
get_module_device_location,
save_activation,
)
from oslo.torch.nn.parallel.pipeline_parallel._functional import remote_module_forward, apply_backward_redirection
from oslo.torch.nn.parallel.pipeline_parallel._functional import (
remote_module_forward,
apply_backward_redirection,
)
from oslo.torch.nn.parallel.pipeline_parallel._sync import (
wait_other_ranks,
make_unique_key,
reset_forward_used_counter,
set_result,
get_result,
)
from oslo.torch.nn.parallel.pipeline_parallel._messages import pack_tensor_stub, unpack_tensor_stub
from oslo.torch.nn.parallel.pipeline_parallel._messages import (
pack_tensor_stub,
unpack_tensor_stub,
)
from oslo.torch.nn.parallel.pipeline_parallel._model_partitioner import ModelPartitioner


@@ -37,7 +43,7 @@ def PipelineParallel(
module=module,
parallel_context=parallel_context,
memory_computation_balance=memory_computation_balance,
num_micro_batches=num_micro_batches
num_micro_batches=num_micro_batches,
)


@@ -139,7 +145,9 @@ def forward(self, *args, **kwargs):

is_grad_enabled = torch.is_grad_enabled()
for ind, (args_, kwargs_) in enumerate(zip(new_args, new_kwargs)):
future = self.producer.submit(launch, self.module, is_grad_enabled, *args_, **kwargs_)
future = self.producer.submit(
launch, self.module, is_grad_enabled, *args_, **kwargs_
)
futures.append(future)

for i, done in enumerate(concurrent.futures.as_completed(futures)):
@@ -234,12 +242,16 @@ def new_forward(*args, **kwargs):
to=callee,
func=remote_module_forward,
args=(
caller, location, unique_key,
args_stub, kwargs_stub,
need_activation_save,
is_training,
is_grad_enabled,
) + tensors,
caller,
location,
unique_key,
args_stub,
kwargs_stub,
need_activation_save,
is_training,
is_grad_enabled,
)
+ tensors,
)
# receive result as stub
result_stub, tensors, requires_redirection = fut.wait()
tests/torch/nn/parallel/pipeline_parallel/test_pp.py (27 changes: 19 additions & 8 deletions)
@@ -15,9 +15,12 @@
from datasets import load_dataset
from transformers import (
AutoTokenizer,
GPT2Config, GPT2LMHeadModel,
T5Config, T5ForConditionalGeneration,
BartConfig, BartForConditionalGeneration,
GPT2Config,
GPT2LMHeadModel,
T5Config,
T5ForConditionalGeneration,
BartConfig,
BartForConditionalGeneration,
set_seed,
)

@@ -56,7 +59,9 @@ def forward(
) -> Union[Tuple[torch.FloatTensor], Seq2SeqLMOutput]:

use_cache = use_cache if use_cache is not None else self.config.use_cache
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
return_dict = (
return_dict if return_dict is not None else self.config.use_return_dict
)

# Encode if needed (training, first prediction pass)
if encoder_outputs is None:
@@ -82,7 +87,11 @@
if self.model_parallel:
torch.cuda.set_device(self.decoder.first_device)

if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None:
if (
labels is not None
and decoder_input_ids is None
and decoder_inputs_embeds is None
):
# get decoder inputs from shifting lm labels to the right
decoder_input_ids = self._shift_right(labels)

@@ -95,7 +104,9 @@
if attention_mask is not None:
attention_mask = attention_mask.to(self.decoder.first_device)
if decoder_attention_mask is not None:
decoder_attention_mask = decoder_attention_mask.to(self.decoder.first_device)
decoder_attention_mask = decoder_attention_mask.to(
self.decoder.first_device
)

# Decode
decoder_outputs = self.decoder(
@@ -170,7 +181,7 @@ def forward(

model_name = "t5-small"
config = T5Config.from_pretrained(model_name)
config.dropout_rate = 0.
config.dropout_rate = 0.0
model = T5ForConditionalGeneration(config)
# model = T5Debug(config)

@@ -215,7 +226,7 @@ def forward(
#
if torch.distributed.get_rank() == 1:
for k, v in _MODULE_DEVICE_LOCATIONS.items():
print(f'{k}: {v}')
print(f"{k}: {v}")


def run():
