diff --git a/emm/data/create_data.py b/emm/data/create_data.py
index 2b9959a..481db78 100644
--- a/emm/data/create_data.py
+++ b/emm/data/create_data.py
@@ -511,5 +511,5 @@ def create_training_data() -> tuple[pd.DataFrame, Vocabulary]:
         lambda r: None if r["no_candidate"] else r["gt_uid"], axis=1
     )  # By convention, gt_uid is null in the no_candidate case
 
-    vocabulary = Vocabulary(very_common_words={"bv", "nv"}, common_words={"bank", "holding"})
+    vocabulary = Vocabulary(very_common_words={"bv", "nv", "ltd", "plc", "limited", "co", "llp"}, common_words={"bank", "holding", "holdings"})
     return df, vocabulary