chg: Updated README.

cedricbonhomme · cedricbonhomme · commit 347b742395ea · 2025-02-25T08:12:41.000+01:00
diff --git a/README.md b/README.md
@@ -45,7 +45,7 @@ Then ensures that the kvrocks database of Vulnerability-Lookup is running.
 Creation of datasets:
 
 ```bash
-$ vulntrain-create-dataset --nb-rows 10000 --upload --repo-id CIRCL/vulnerability-dataset-10k
+$ vulntrain-dataset-generation --sources cvelistv5 --nb-rows 10000 --upload --repo-id CIRCL/vulnerability-dataset-10k
 Generating train split: 9999 examples [00:00, 177710.74 examples/s]
 DatasetDict({
     train: Dataset({
@@ -73,7 +73,7 @@ For now we are using distilbert-base-uncased (AutoModelForMaskedLM) or gpt2 (Aut
 The goal is to generate text.
 
 ```bash
-$ vulntrain-train-dataset --base-model gpt2 --model-name CIRCL/vulnerability
+$ vulntrain-train-description-generation --base-model gpt2 --dataset-id CIRCL/vulnerability --repo-id CIRCL/vulnerability-description-generation-gpt2
 Using CUDA (Nvidia GPU).
 [codecarbon WARNING @ 13:28:13] Multiple instances of codecarbon are allowed to run at the same time.
 [codecarbon INFO @ 13:28:13] [setup] RAM Tracking...
diff --git a/vulntrain/datasets/create_dataset.py b/vulntrain/datasets/create_dataset.py
@@ -151,8 +151,11 @@ def main():
     print(dataset_dict)
 
     if args.upload:
-        # dataset_dict.push_to_hub(args.repo_id, commit_message=args.commit_message, token=hf_token)
-        dataset_dict.push_to_hub(args.repo_id)
+        if args.commit_message:
+            # dataset_dict.push_to_hub(args.repo_id, commit_message=args.commit_message, token=hf_token)
+            dataset_dict.push_to_hub(args.repo_id, commit_message=args.commit_message)
+        else:
+            dataset_dict.push_to_hub(args.repo_id)
 
 
 if __name__ == "__main__":
diff --git a/vulntrain/trainers/classify.py b/vulntrain/trainers/classify.py
@@ -183,7 +183,7 @@ def main():
     parser.add_argument(
         "--model-save-dir",
         dest="model_save_dir",
-        required=True,
+        default="results",
         help="The path to a directory where the tokenizer and the model will be saved.",
     )
 
diff --git a/vulntrain/trainers/summarize.py b/vulntrain/trainers/summarize.py
@@ -130,7 +130,7 @@ def main():
     parser.add_argument(
         "--model-save-dir",
         dest="model_save_dir",
-        required=True,
+        default="results",
         help="The path to a directory where the tokenizer and the model will be saved.",
     )
 

Original file line number	Diff line number	Diff line change
`@@ -183,7 +183,7 @@ def main():`
`183`	`183`	`parser.add_argument(`
`184`	`184`	`"--model-save-dir",`
`185`	`185`	`dest="model_save_dir",`
`186`		`- required=True,`
	`186`	`+ default="results",`
`187`	`187`	`help="The path to a directory where the tokenizer and the model will be saved.",`
`188`	`188`	`)`
`189`	`189`
Original file line number	Diff line number	Diff line change
`@@ -130,7 +130,7 @@ def main():`
`130`	`130`	`parser.add_argument(`
`131`	`131`	`"--model-save-dir",`
`132`	`132`	`dest="model_save_dir",`
`133`		`- required=True,`
	`133`	`+ default="results",`
`134`	`134`	`help="The path to a directory where the tokenizer and the model will be saved.",`
`135`	`135`	`)`
`136`	`136`