Implements Vera #763

Open · wants to merge 68 commits into main

Commits (68)
93bee1a
Make generation tests generic
TimoImhof Sep 16, 2024
f51cfdb
Merge remote-tracking branch 'origin/main' into dev/test-refactoring
TimoImhof Oct 16, 2024
7e65e82
Draft Refactoring AdapterTestBase
TimoImhof Oct 28, 2024
793cbe5
Merge branch 'adapter-hub:main' into dev/test-refactoring
TimoImhof Oct 30, 2024
65c3fb7
Replace import class names
TimoImhof Oct 30, 2024
afdcfdd
Merge branch 'dev/test-refactoring' of https://github.com/TimoImhof/a…
TimoImhof Oct 30, 2024
ee6166c
Base refactoring:
TimoImhof Nov 1, 2024
630b722
remove redundant imports
TimoImhof Nov 1, 2024
0d3577f
Add pytest markers and respective pytest commands
TimoImhof Nov 1, 2024
1300856
Add draft of README
TimoImhof Nov 1, 2024
78387db
Refactoring:
TimoImhof Nov 5, 2024
83d3b32
Fix make quality
TimoImhof Nov 5, 2024
5e8e1b8
Add gpt2 tests
TimoImhof Nov 5, 2024
53eb0b9
Fix config union and head tests
TimoImhof Nov 7, 2024
1dbd412
Fix paths and imports
TimoImhof Nov 7, 2024
cf4f6a7
remove accidently added prompt tuning from gpt2 and make style
TimoImhof Nov 7, 2024
b390d61
Revert PromptTuning changes
TimoImhof Nov 7, 2024
2193aee
Revert "Revert PromptTuning changes"
TimoImhof Nov 7, 2024
f555484
Re-add missing adapter model tests
TimoImhof Nov 7, 2024
8dccda2
Refactoring:
TimoImhof Nov 7, 2024
c665948
Introduce generic test creator function
TimoImhof Nov 8, 2024
fb425b6
Re-add beit adapter method tests
TimoImhof Nov 8, 2024
225439c
Refactor & Re-add bertgeneration and bert
TimoImhof Nov 9, 2024
09f9cdc
Re-add clip tests
TimoImhof Nov 11, 2024
7934350
Re-add:
TimoImhof Nov 11, 2024
5f55935
Add more models
TimoImhof Nov 21, 2024
147c8af
Re-add whisper
TimoImhof Nov 27, 2024
57c5131
initial commit
julian-fong Dec 1, 2024
259a268
improved docstring and fixed formatting issues
julian-fong Dec 1, 2024
b66571c
fixed formatting
julian-fong Dec 1, 2024
acee994
updates
julian-fong Dec 12, 2024
18182af
Updates
julian-fong Dec 12, 2024
f28e508
Updates
julian-fong Dec 12, 2024
f38b0e3
removed typo
julian-fong Dec 12, 2024
385cd35
fix black
julian-fong Dec 12, 2024
46af3fd
updates
julian-fong Dec 14, 2024
9f3a202
fixed typo
julian-fong Dec 14, 2024
b2979ce
Changes:
TimoImhof Dec 16, 2024
ffd21a9
Add debug statements and only execute failing test
TimoImhof Dec 18, 2024
0dba87c
Add verbose information
TimoImhof Dec 18, 2024
c333467
check package versions
TimoImhof Dec 18, 2024
aac4038
More debugging statements
TimoImhof Dec 18, 2024
0f4c9b6
Merge branch 'adapter-hub:main' into dev/test-refactoring
TimoImhof Dec 22, 2024
12379e3
Merge branch 'main' into implement_vera
julian-fong Dec 23, 2024
0c0f7e6
updates
julian-fong Dec 23, 2024
99cfb68
Merge branch 'implement_vera' of github.com:julian-fong/adapters into…
julian-fong Dec 23, 2024
9ac515c
Merge branch 'adapter-hub:main' into dev/test-refactoring
TimoImhof Dec 23, 2024
4af10df
Fix failing test:
TimoImhof Dec 23, 2024
1229fc5
added review updates
julian-fong Dec 24, 2024
20ddb5c
apply fix from #770
julian-fong Dec 24, 2024
dbd4965
Update README
TimoImhof Dec 24, 2024
25fe0a9
updated docstring
julian-fong Dec 24, 2024
7f79832
updated docstring
julian-fong Dec 24, 2024
d1a4a09
Merge branch 'main' of https://github.com/TimoImhof/adapters into dev…
TimoImhof Dec 27, 2024
c516464
Fix hf version and clip tests
TimoImhof Dec 27, 2024
470169f
Merge branch 'adapter-hub:main' into implement_vera
julian-fong Jan 2, 2025
bb019b0
Merge branch 'adapter-hub:main' into implement_vera
julian-fong Jan 6, 2025
87c0998
Merge branch 'adapter-hub:main' into dev/test-refactoring
TimoImhof Jan 8, 2025
2c80a5c
Polish:
TimoImhof Jan 8, 2025
be69f0a
Merge branch 'main' into dev/test-refactoring
TimoImhof Jan 8, 2025
f1b1136
Merge branch 'main' into dev/test-refactoring
TimoImhof Jan 8, 2025
ebdf0a7
Merge remote-tracking branch 'github-desktop-TimoImhof/dev/test-refac…
julian-fong Jan 11, 2025
cd95c06
configure vera tests to #740
julian-fong Jan 11, 2025
a0e578a
fix quality:
julian-fong Jan 11, 2025
3e7f2a5
update model_mixin.py to use forwardcontext in merge_adapter and rese…
julian-fong Jan 18, 2025
f62a44d
update black
julian-fong Jan 18, 2025
ac914c4
updated black
julian-fong Jan 18, 2025
ef574bf
updates
julian-fong Jan 23, 2025
19 changes: 15 additions & 4 deletions Makefile
@@ -28,18 +28,29 @@ style:
isort $(check_dirs)
${MAKE} extra_style_checks

# Run tests for the library
# Library Tests

# run all tests in the library
test:
python -m pytest -n auto --dist=loadfile -s -v ./tests/
python -c "import transformers; print(transformers.__version__)"

# run all tests for the adapter methods for all adapter models
test-adapter-methods:
python -m pytest --ignore ./tests/models -n auto --dist=loadfile -s -v ./tests/
python -m pytest -n auto --dist=loadfile -s -v ./tests/test_methods/

# run a subset of the adapter method tests for all adapter models
# list of all subsets: [core, heads, embeddings, composition, prefix_tuning, prompt_tuning, reft, unipelt, compacter, bottleneck, ia3, lora, config_union]
subset ?=
test-adapter-method-subset:
@echo "Running subset $(subset)"
python -m pytest -n auto --dist=loadfile -s -v ./tests/test_methods/ -m $(subset)


# run the Hugging Face test suite for all adapter models
test-adapter-models:
python -m pytest -n auto --dist=loadfile -s -v ./tests/models
python -m pytest -n auto --dist=loadfile -s -v ./tests/test_models/

# Run tests for examples

test-examples:
python -m pytest -n auto --dist=loadfile -s -v ./examples/pytorch/
5 changes: 5 additions & 0 deletions conftest.py
@@ -87,3 +87,8 @@ def check_output(self, want, got, optionflags):


doctest.OutputChecker = CustomOutputChecker


def pytest_collection_modifyitems(items):
# Exclude the 'test_class' group from the test collection since it's not a real test class but a byproduct of the generic test class generation.
items[:] = [item for item in items if 'test_class' not in item.nodeid]
1 change: 1 addition & 0 deletions docs/overview.md
@@ -56,6 +56,7 @@ Identifiers and configuration classes are explained in more detail in the [next
| `prefix_tuning_flat` | `PrefixTuningConfig(flat=True)` | [Prefix Tuning](methods.html#prefix-tuning) |
| `lora` | `LoRAConfig()` | [LoRA](methods.html#lora) |
| `ia3` | `IA3Config()` | [IA³](methods.html#ia-3) |
| `vera` | `VeraConfig()` | [Vera](methods.html#vera) |
| `mam` | `MAMConfig()` | [Mix-and-Match Adapters](method_combinations.html#mix-and-match-adapters) |
| `unipelt` | `UniPELTConfig()` | [UniPELT](method_combinations.html#unipelt) |
| `prompt_tuning` | `PromptTuningConfig()` | [Prompt Tuning](methods.html#prompt-tuning) |
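For orientation, a minimal usage sketch of the new identifier (the `bert-base-uncased` checkpoint and the adapter names are illustrative choices, not part of this PR):

```python
import adapters
from adapters import VeraConfig
from transformers import AutoModel

model = AutoModel.from_pretrained("bert-base-uncased")
adapters.init(model)  # make the plain transformers model adapter-capable

# The new shorthand maps to VeraConfig() with its defaults ...
model.add_adapter("vera_default", config="vera")
# ... or an explicit config can override r, vera_d, vera_b, init_weights, etc.
model.add_adapter("vera_custom", config=VeraConfig(r=8, vera_d=0.1, vera_b=0.0))
model.train_adapter("vera_custom")
```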
2 changes: 1 addition & 1 deletion examples/pytorch/language-modeling/run_clm.py
@@ -442,7 +442,7 @@ def main():
else:
model = AutoModelForCausalLM.from_config(config, trust_remote_code=model_args.trust_remote_code)
n_params = sum({p.data_ptr(): p.numel() for p in model.parameters()}.values())
logger.info(f"Training new model from scratch - Total size={n_params/2**20:.2f}M params")
logger.info(f"Training new model from scratch - Total size={n_params / 2**20:.2f}M params")

# Convert the model into an adapter model
adapters.init(model)
15 changes: 13 additions & 2 deletions pyproject.toml
@@ -1,10 +1,21 @@
[tool.black]
line-length = 119
target-version = ['py38', 'py39', 'py310']

# copied from HF for testing
[tool.pytest.ini_options]
markers = [
"core: marks tests as core adapter test",
"composition: marks tests as composition adapter test",
"heads: marks tests as heads adapter test",
"embeddings: marks tests as embeddings adapter test",
"class_conversion: marks tests as class conversion adapter test",
"prefix_tuning: marks tests as prefix tuning adapter test",
"prompt_tuning: marks tests as prompt tuning adapter test",
"reft: marks tests as reft adapter test",
"unipelt: marks tests as unipelt adapter test",
"compacter: marks tests as compacter adapter test",
"bottleneck: marks tests as bottleneck adapter test",
"ia3: marks tests as ia3 adapter test",
"lora: marks tests as lora adapter test",
"flash_attn_test: marks tests related to flash attention (deselect with '-m \"not flash_attn_test\"')",
"bitsandbytes: select (or deselect with `not`) bitsandbytes integration tests",
"generate: marks tests that use the GenerationTesterMixin"
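As a hypothetical illustration of how these markers are meant to be consumed (the class and test names below are placeholders, not the actual tests under `tests/test_methods/`):

```python
import pytest


# Placeholder test class; the real adapter-method tests are generated under tests/test_methods/.
@pytest.mark.lora
class TestLoraVariantsOnSomeModel:
    def test_add_adapter(self):
        ...  # test body elided


# Selecting a subset then mirrors the Makefile target above:
#   python -m pytest -m lora ./tests/test_methods/
```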
3 changes: 3 additions & 0 deletions setup.cfg
@@ -49,6 +49,9 @@ use_parentheses = True
[flake8]
ignore = E203, E501, E731, E741, W503, W605
max-line-length = 119
per-file-ignores =
tests/test_methods/generator.py: F401, F403, F405
tests/test_methods/test_*.py: F403, F405

[tool:pytest]
doctest_optionflags=NUMBER NORMALIZE_WHITESPACE ELLIPSIS
2 changes: 2 additions & 0 deletions src/adapters/__init__.py
@@ -64,6 +64,7 @@
"SeqBnInvConfig",
"StaticAdapterFusionConfig",
"UniPELTConfig",
"VeraConfig",
],
"context": [
"AdapterSetup",
@@ -181,6 +182,7 @@
SeqBnInvConfig,
StaticAdapterFusionConfig,
UniPELTConfig,
VeraConfig,
)
from .context import AdapterSetup, ForwardContext
from .heads import (
37 changes: 36 additions & 1 deletion src/adapters/configuration/adapter_config.py
@@ -487,11 +487,20 @@ class LoRAConfig(AdapterConfig):
(addition of decomposed matrix, as in LoRA) or "scale" (element-wise multiplication of vector, as in
(IA)^3). "scale" can only be used together with r=1. Defaults to "add".
init_weights (:obj:`str`, optional): Initialization method for the weights of the LoRA modules.
Currently, this can be either "lora" (default) or "bert".
Currently, this can be "lora" (default), "bert", or "vera".
use_gating (:obj:`bool`, optional):
Place a trainable gating module besides the added parameter module to control module activation. This is
e.g. used for UniPELT. Defaults to False. Note that modules with use_gating=True cannot be merged using
`merge_adapter()`.
vera_d (:obj:`float`, optional):
The initial value of the trainable scaling parameter `d` placed before the decomposition matrix A,
as used by VeRA. Defaults to None.
vera_b (:obj:`float`, optional):
The initial value of the trainable scaling parameter `b` placed before the decomposition matrix B,
as used by VeRA. Defaults to None.
dtype (str, optional): torch dtype for reparametrization tensors. Defaults to None.
"""

@@ -509,6 +518,8 @@ class LoRAConfig(AdapterConfig):
composition_mode: str = "add"
init_weights: str = "lora"
use_gating: bool = False
vera_d: float = None
vera_b: float = None
dtype: Optional[str] = None


@@ -535,6 +546,29 @@ class IA3Config(LoRAConfig):
dtype: Optional[str] = None


@dataclass(eq=False)
class VeraConfig(LoRAConfig):
"""
LoRA config that applies Vector-based Random Matrix Adaptation (VeRA). It adds trainable
diagonal scaling matrices `d` and `b` while keeping the LoRA matrices A and B frozen, random,
and shared across all layers. See the paper for details: https://arxiv.org/pdf/2310.11454.
Note that `r` must still be supplied since the decomposition matrices A and B are still
initialized. The `composition_mode` parameter must be set to `add`.
"""

selfattn_lora: bool = True
intermediate_lora: bool = False
output_lora: bool = False

r: int = 8
vera_d: float = 0.1
vera_b: float = 0.0
init_weights: str = "vera"
composition_mode: str = "add"
dtype: Optional[str] = None


@dataclass(eq=False)
class ReftConfig(AdapterConfig):
"""
@@ -770,6 +804,7 @@ def __init__(
"prompt_tuning": PromptTuningConfig(),
"lora": LoRAConfig(),
"ia3": IA3Config(),
"vera": VeraConfig(),
"loreft": LoReftConfig(),
"noreft": NoReftConfig(),
"direft": DiReftConfig(),
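Taken together with the `Vera` module added in `src/adapters/methods/lora.py` below, the new fields describe the following weight update (a summary in the VeRA paper's notation, with `h` the model hidden size):

```math
\Delta W = \Lambda_b \, B \, \Lambda_d \, A, \qquad A \in \mathbb{R}^{r \times h}, \quad B \in \mathbb{R}^{h \times r}
```

Here A and B are frozen, random, and shared across layers, while the trainable scaling matrices are initialised as Λ_d = vera_d · I_r and Λ_b = vera_b · I_h; the adapter output is additionally scaled by `alpha / r`, as in LoRA.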
128 changes: 126 additions & 2 deletions src/adapters/methods/lora.py
@@ -16,6 +16,7 @@

from ..composition import Average, BatchSplit, Parallel, Stack
from ..configuration import LoRAConfig, ModelAdaptersConfig
from ..context import ForwardContext
from .adapter_layer_base import AdapterLayerBase, ComposableAdapterLayerBase
from .utils import dequantize_bnb_weight

@@ -37,6 +38,7 @@ def __init__(
lora_B_shape,
config: LoRAConfig,
gating_heads: int = 1,
name: str = None,
):
super().__init__()
assert config.composition_mode == "add", "LoRA module only supports composition_mode='add'."
@@ -45,6 +47,7 @@ def __init__(
self.composition_mode = config.composition_mode
self.attn_matrices = config.attn_matrices
self.use_gating = config.use_gating
self.name = name
# Optional dropout
if config.dropout > 0.0:
self.lora_dropout = nn.Dropout(p=config.dropout)
Expand All @@ -69,6 +72,9 @@ def __init__(
elif config.init_weights == "ia3":
nn.init.ones_(self.lora_A)
nn.init.ones_(self.lora_B)
elif config.init_weights == "vera":
nn.init.kaiming_uniform_(self.lora_A)
nn.init.kaiming_uniform_(self.lora_B)
else:
raise ValueError("Unknown init_weights type: {}".format(config.init_weights))

@@ -112,6 +118,7 @@ def __init__(
lora_B_shape,
config: LoRAConfig,
gating_heads: int = 1,
name: str = None,
):
super().__init__()
assert config.composition_mode == "scale", "IA3 module only supports composition_mode='scale'."
@@ -122,6 +129,7 @@ def __init__(
self.composition_mode = config.composition_mode
self.attn_matrices = config.attn_matrices
self.use_gating = config.use_gating
self.name = name
# Optional dropout
if config.dropout > 0.0:
raise ValueError("IA3 module does not support dropout.")
@@ -133,7 +141,7 @@ def __init__(
# For compatibility with LoRA, allow all init_weights types here.
# Usually should be "ia3".
if config.init_weights == "lora":
logger.warning("(IA)^3 module initialized with LoRA zeo init. Ignore if this is intended.")
logger.warning("(IA)^3 module initialized with LoRA zero init. Ignore if this is intended.")
nn.init.zeros_(self.lora_B)
elif config.init_weights == "bert":
nn.init.normal_(self.lora_B, std=0.02)
@@ -177,6 +185,116 @@ def forward(self, hidden_states: Optional[torch.Tensor], layer_input: torch.Tens
return hidden_states, gate


class Vera(nn.Module):
def __init__(
self,
lora_A_shape,
lora_B_shape,
config: LoRAConfig,
gating_heads: int = 1,
name: str = None,
):
super().__init__()
Review comment (Member): we should also add an assert for composition mode "add" here (same as in LoRA init), just to make sure

self.d = config.vera_d
self.b = config.vera_b
self.r = config.r
self.alpha = config.alpha
self.use_gating = config.use_gating
self.name = name

# check to make sure that the `composition_mode` is set to `add`
self.composition_mode = config.composition_mode
if self.composition_mode != "add":
raise ValueError("Vera module only supports composition_mode='add'.")

# Optional dropout
if config.dropout > 0.0:
self.lora_dropout = nn.Dropout(p=config.dropout)

self.lora_A_shape = lora_A_shape
self.lora_B_shape = lora_B_shape
self.d_shape = self.lora_A_shape[0]
self.b_shape = self.lora_B_shape[0]

# Actual trainable parameters
self.vera_D = nn.Parameter(torch.diag(torch.ones(self.d_shape) * self.d))
self.vera_B = nn.Parameter(torch.diag(torch.ones(self.b_shape) * self.b))
self.scaling = self.alpha / self.r

if self.use_gating:
self.gate = nn.Linear(lora_A_shape[-1], gating_heads)
nn.init.normal_(self.gate.weight, std=0.02)

@property
def delta_w(self) -> torch.Tensor:
parameters = ForwardContext.get_context().shared_parameters[self.name]
lora_A = parameters["lora_A"]
lora_B = parameters["lora_B"]
return self.vera_B @ lora_B @ self.vera_D @ lora_A

def com(self, weights: torch.Tensor, added: torch.Tensor, scaling=None) -> torch.Tensor:
"""Performs the composition operation between existing and injected weights."""
if scaling is None:
scaling = self.scaling
return weights + added * scaling

def com_inv(self, weights: torch.Tensor, added: torch.Tensor) -> torch.Tensor:
"""Inverts the composition operation between existing and injected weights."""
return weights - added * self.scaling

def forward(self, hidden_states: Optional[torch.Tensor], layer_input: torch.Tensor):
parameters = ForwardContext.get_context().shared_parameters[self.name]
lora_A = parameters["lora_A"]
lora_B = parameters["lora_B"]

if hidden_states is None:
hidden_states = layer_input

if getattr(self, "lora_dropout"):
hidden_states = self.lora_dropout(hidden_states)

hidden_states = hidden_states @ torch.t(self.vera_B @ lora_B @ self.vera_D @ lora_A)

if self.use_gating:
gate = torch.sigmoid(self.gate(layer_input))
gate = torch.mean(gate, dim=1).unsqueeze(-1)
hidden_states = hidden_states * gate
else:
gate = None
Review comment (Member): as this is likely merged after #770, the same fix from there should be applied here

hidden_states = hidden_states * self.scaling

return hidden_states, gate
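
As a standalone shape check of the composition used in `delta_w` and `forward` above (plain tensors stand in for the module parameters and the shared A/B; sizes are illustrative):

```python
import torch

h, r = 768, 8                                # illustrative hidden size and rank
lora_A = torch.randn(r, h)                   # frozen, random, shared across layers
lora_B = torch.randn(h, r)                   # frozen, random, shared across layers
vera_D = torch.diag(torch.full((r,), 0.1))   # trainable, seeded from vera_d
vera_B = torch.diag(torch.full((h,), 0.0))   # trainable, seeded from vera_b

delta_w = vera_B @ lora_B @ vera_D @ lora_A  # (h, h)
assert delta_w.shape == (h, h)

x = torch.randn(2, 16, h)                    # (batch, seq_len, hidden)
assert (x @ delta_w.t()).shape == x.shape

# With the default vera_b = 0.0 the initial delta is all zeros, so a freshly added
# Vera adapter leaves the base model's output unchanged until training updates vera_B.
assert torch.count_nonzero(delta_w) == 0
```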


def init_shared_vera_parameters(model_config, adapter_config, device):
hidden_size = model_config.hidden_size
r = adapter_config["r"]

parameters = nn.ParameterDict()

# initialize frozen, random tensors A, B
parameters["lora_A"] = torch.zeros(r, hidden_size).to(device)
parameters["lora_B"] = torch.zeros(hidden_size, r).to(device)

if adapter_config["init_weights"] == "lora":
# initialize A the same way as the default for nn.Linear and B to zero
nn.init.kaiming_uniform_(parameters["lora_A"], a=math.sqrt(5))
nn.init.zeros_(parameters["lora_B"])
elif adapter_config["init_weights"] == "bert":
nn.init.normal_(parameters["lora_A"], std=0.02)
nn.init.normal_(parameters["lora_B"], std=0.02)
elif adapter_config["init_weights"] == "ia3":
nn.init.ones_(parameters["lora_A"])
nn.init.ones_(parameters["lora_B"])
elif adapter_config["init_weights"] == "vera":
nn.init.kaiming_uniform_(parameters["lora_A"])
nn.init.kaiming_uniform_(parameters["lora_B"])
else:
raise ValueError("Unknown init_weights type: {}".format(adapter_config["init_weights"]))

return parameters
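
For intuition on why A and B are shared: a back-of-the-envelope count of the conceptually trainable numbers per adapted weight matrix, using illustrative sizes (note that the module above materialises the diagonal scalings as full r×r and h×h parameter tensors, so the literal `nn.Parameter` sizes are larger than this conceptual count):

```python
h, r, n_layers = 768, 8, 12                  # illustrative hidden size, rank, layer count

lora_trainable = n_layers * (r * h + h * r)  # per-layer trainable A and B
vera_shared    = r * h + h * r               # frozen random A/B, stored once for the model
vera_trainable = n_layers * (r + h)          # per-layer diagonal scalings d and b

print(lora_trainable)  # 147456
print(vera_shared)     # 12288
print(vera_trainable)  # 9312
```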


class LoRALayer(AdapterLayerBase):
adapter_modules_name = "loras"

@@ -202,6 +320,7 @@ def _get_lora_shapes(self, config: LoRAConfig):

def add_adapter(self, adapter_name: str, layer_idx: int) -> bool:
self.layer_idx = layer_idx

lora_config = self.adapters_config.match(
adapter_name,
config_type=LoRAConfig,
@@ -210,7 +329,10 @@ def add_adapter(self, adapter_name: str, layer_idx: int) -> bool:
)
if lora_config is not None and self._check_lora_location(lora_config):
if lora_config.composition_mode == "add":
lora_cls = LoRA
if isinstance(lora_config.vera_d, float) or isinstance(lora_config.vera_b, float):
lora_cls = Vera
else:
lora_cls = LoRA
elif lora_config.composition_mode == "scale":
lora_cls = IA3
else:
@@ -219,7 +341,9 @@ def add_adapter(self, adapter_name: str, layer_idx: int) -> bool:
*self._get_lora_shapes(lora_config),
lora_config,
gating_heads=self.get_n_heads(lora_config),
name=adapter_name,
)

lora.train(self.training)
lora = lora.to(self.weight.device)
self.loras[adapter_name] = lora
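A hedged note on the branch in `add_adapter` above: any `composition_mode="add"` config whose `vera_d` or `vera_b` is a float is routed to the `Vera` module, so in principle VeRA can also be reached from a plain `LoRAConfig`. Whether the surrounding model code also sets up the shared A/B in that case is not visible in this diff, so `VeraConfig` remains the documented entry point. A sketch mirroring the selection logic (illustrative only):

```python
from adapters import LoRAConfig, VeraConfig

def picks_vera(cfg) -> bool:
    """Mirror of the class-selection branch in LoRALayer.add_adapter (illustrative only)."""
    return cfg.composition_mode == "add" and (
        isinstance(cfg.vera_d, float) or isinstance(cfg.vera_b, float)
    )

print(picks_vera(LoRAConfig()))                                 # False (vera_d/vera_b default to None)
print(picks_vera(VeraConfig()))                                 # True  (vera_d=0.1, vera_b=0.0)
print(picks_vera(LoRAConfig(vera_d=0.1, init_weights="vera")))  # True
```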