Skip to content

Commit 21d894d

Browse files
authored
refactor: adopt token instead of use_auth_token in HF components (deepset-ai#6040)
* move embedding backends
* use token in Sentence Transformers embeddings
* more compact token handling
* token parameter in reader
* add token to ranker
* release note
* add test for reader
1 parent 4e4af99 commit 21d894d

File tree

9 files changed

+153
-38
lines changed

9 files changed

+153
-38
lines changed

haystack/preview/components/embedders/sentence_transformers_document_embedder.py

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -17,7 +17,7 @@ def __init__(
1717
self,
1818
model_name_or_path: str = "sentence-transformers/all-mpnet-base-v2",
1919
device: Optional[str] = None,
20-
use_auth_token: Union[bool, str, None] = None,
20+
token: Union[bool, str, None] = None,
2121
prefix: str = "",
2222
suffix: str = "",
2323
batch_size: int = 32,
@@ -33,7 +33,7 @@ def __init__(
3333
such as ``'sentence-transformers/all-mpnet-base-v2'``.
3434
:param device: Device (like 'cuda' / 'cpu') that should be used for computation.
3535
Defaults to CPU.
36-
:param use_auth_token: The API token used to download private models from Hugging Face.
36+
:param token: The API token used to download private models from Hugging Face.
3737
If this parameter is set to `True`, then the token generated when running
3838
`transformers-cli login` (stored in ~/.huggingface) will be used.
3939
:param prefix: A string to add to the beginning of each Document text before embedding.
@@ -48,7 +48,7 @@ def __init__(
4848
self.model_name_or_path = model_name_or_path
4949
# TODO: remove device parameter and use Haystack's device management once migrated
5050
self.device = device or "cpu"
51-
self.use_auth_token = use_auth_token
51+
self.token = token
5252
self.prefix = prefix
5353
self.suffix = suffix
5454
self.batch_size = batch_size
@@ -71,7 +71,7 @@ def to_dict(self) -> Dict[str, Any]:
7171
self,
7272
model_name_or_path=self.model_name_or_path,
7373
device=self.device,
74-
use_auth_token=self.use_auth_token,
74+
token=self.token if not isinstance(self.token, str) else None, # don't serialize valid tokens
7575
prefix=self.prefix,
7676
suffix=self.suffix,
7777
batch_size=self.batch_size,
@@ -94,7 +94,7 @@ def warm_up(self):
9494
"""
9595
if not hasattr(self, "embedding_backend"):
9696
self.embedding_backend = _SentenceTransformersEmbeddingBackendFactory.get_embedding_backend(
97-
model_name_or_path=self.model_name_or_path, device=self.device, use_auth_token=self.use_auth_token
97+
model_name_or_path=self.model_name_or_path, device=self.device, use_auth_token=self.token
9898
)
9999

100100
@component.output_types(documents=List[Document])

haystack/preview/components/embedders/sentence_transformers_text_embedder.py

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -16,7 +16,7 @@ def __init__(
1616
self,
1717
model_name_or_path: str = "sentence-transformers/all-mpnet-base-v2",
1818
device: Optional[str] = None,
19-
use_auth_token: Union[bool, str, None] = None,
19+
token: Union[bool, str, None] = None,
2020
prefix: str = "",
2121
suffix: str = "",
2222
batch_size: int = 32,
@@ -30,7 +30,7 @@ def __init__(
3030
such as ``'sentence-transformers/all-mpnet-base-v2'``.
3131
:param device: Device (like 'cuda' / 'cpu') that should be used for computation.
3232
Defaults to CPU.
33-
:param use_auth_token: The API token used to download private models from Hugging Face.
33+
:param token: The API token used to download private models from Hugging Face.
3434
If this parameter is set to `True`, then the token generated when running
3535
`transformers-cli login` (stored in ~/.huggingface) will be used.
3636
:param prefix: A string to add to the beginning of each text.
@@ -43,7 +43,7 @@ def __init__(
4343
self.model_name_or_path = model_name_or_path
4444
# TODO: remove device parameter and use Haystack's device management once migrated
4545
self.device = device or "cpu"
46-
self.use_auth_token = use_auth_token
46+
self.token = token
4747
self.prefix = prefix
4848
self.suffix = suffix
4949
self.batch_size = batch_size
@@ -64,7 +64,7 @@ def to_dict(self) -> Dict[str, Any]:
6464
self,
6565
model_name_or_path=self.model_name_or_path,
6666
device=self.device,
67-
use_auth_token=self.use_auth_token,
67+
token=self.token if not isinstance(self.token, str) else None, # don't serialize valid tokens
6868
prefix=self.prefix,
6969
suffix=self.suffix,
7070
batch_size=self.batch_size,
@@ -85,7 +85,7 @@ def warm_up(self):
8585
"""
8686
if not hasattr(self, "embedding_backend"):
8787
self.embedding_backend = _SentenceTransformersEmbeddingBackendFactory.get_embedding_backend(
88-
model_name_or_path=self.model_name_or_path, device=self.device, use_auth_token=self.use_auth_token
88+
model_name_or_path=self.model_name_or_path, device=self.device, use_auth_token=self.token
8989
)
9090

9191
@component.output_types(embedding=List[float])

haystack/preview/components/rankers/similarity.py

Lines changed: 16 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -36,15 +36,19 @@ class SimilarityRanker:
3636
def __init__(
3737
self,
3838
model_name_or_path: Union[str, Path] = "cross-encoder/ms-marco-MiniLM-L-6-v2",
39-
top_k: int = 10,
4039
device: str = "cpu",
40+
token: Union[bool, str, None] = None,
41+
top_k: int = 10,
4142
):
4243
"""
4344
Creates an instance of SimilarityRanker.
4445
4546
:param model_name_or_path: Path to a pre-trained sentence-transformers model.
46-
:param top_k: The maximum number of documents to return per query.
4747
:param device: torch device (for example, cuda:0, cpu, mps) to limit model inference to a specific device.
48+
:param token: The API token used to download private models from Hugging Face.
49+
If this parameter is set to `True`, then the token generated when running
50+
`transformers-cli login` (stored in ~/.huggingface) will be used.
51+
:param top_k: The maximum number of documents to return per query.
4852
"""
4953
torch_and_transformers_import.check()
5054

@@ -53,6 +57,7 @@ def __init__(
5357
raise ValueError(f"top_k must be > 0, but got {top_k}")
5458
self.top_k = top_k
5559
self.device = device
60+
self.token = token
5661
self.model = None
5762
self.tokenizer = None
5863

@@ -67,16 +72,22 @@ def warm_up(self):
6772
Warm up the model and tokenizer used in scoring the documents.
6873
"""
6974
if self.model_name_or_path and not self.model:
70-
self.model = AutoModelForSequenceClassification.from_pretrained(self.model_name_or_path)
75+
self.model = AutoModelForSequenceClassification.from_pretrained(self.model_name_or_path, token=self.token)
7176
self.model = self.model.to(self.device)
7277
self.model.eval()
73-
self.tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path)
78+
self.tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path, token=self.token)
7479

7580
def to_dict(self) -> Dict[str, Any]:
7681
"""
7782
Serialize this component to a dictionary.
7883
"""
79-
return default_to_dict(self, top_k=self.top_k, device=self.device, model_name_or_path=self.model_name_or_path)
84+
return default_to_dict(
85+
self,
86+
device=self.device,
87+
model_name_or_path=self.model_name_or_path,
88+
token=self.token if not isinstance(self.token, str) else None, # don't serialize valid tokens
89+
top_k=self.top_k,
90+
)
8091

8192
@classmethod
8293
def from_dict(cls, data: Dict[str, Any]) -> "SimilarityRanker":

haystack/preview/components/readers/extractive.py

Lines changed: 10 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -25,6 +25,7 @@ def __init__(
2525
self,
2626
model_name_or_path: Union[Path, str] = "deepset/roberta-base-squad2-distilled",
2727
device: Optional[str] = None,
28+
token: Union[bool, str, None] = None,
2829
top_k: int = 20,
2930
confidence_threshold: Optional[float] = None,
3031
max_seq_length: int = 384,
@@ -40,6 +41,9 @@ def __init__(
4041
Can either be a path to a folder containing the model files or an identifier for the HF hub
4142
Default: `'deepset/roberta-base-squad2-distilled'`
4243
:param device: Pytorch device string. Uses GPU by default if available
44+
:param token: The API token used to download private models from Hugging Face.
45+
If this parameter is set to `True`, then the token generated when running
46+
`transformers-cli login` (stored in ~/.huggingface) will be used.
4347
:param top_k: Number of answers to return per query.
4448
It is required even if confidence_threshold is set. Defaults to 20.
4549
:param confidence_threshold: Answers with a confidence score below this value will not be returned
@@ -58,6 +62,7 @@ def __init__(
5862
self.model_name_or_path = str(model_name_or_path)
5963
self.model = None
6064
self.device = device
65+
self.token = token
6166
self.max_seq_length = max_seq_length
6267
self.top_k = top_k
6368
self.confidence_threshold = confidence_threshold
@@ -81,6 +86,7 @@ def to_dict(self) -> Dict[str, Any]:
8186
self,
8287
model_name_or_path=self.model_name_or_path,
8388
device=self.device,
89+
token=self.token if not isinstance(self.token, str) else None,
8490
max_seq_length=self.max_seq_length,
8591
top_k=self.top_k,
8692
confidence_threshold=self.confidence_threshold,
@@ -104,8 +110,10 @@ def warm_up(self):
104110
self.device = self.device or "cuda:0"
105111
else:
106112
self.device = self.device or "cpu:0"
107-
self.model = AutoModelForQuestionAnswering.from_pretrained(self.model_name_or_path).to(self.device)
108-
self.tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path)
113+
self.model = AutoModelForQuestionAnswering.from_pretrained(self.model_name_or_path, token=self.token).to(
114+
self.device
115+
)
116+
self.tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path, token=self.token)
109117

110118
def _flatten_documents(
111119
self, queries: List[str], documents: List[List[Document]]
Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,8 @@
1+
---
2+
preview:
3+
- |
4+
Adopt Hugging Face `token` instead of the deprecated `use_auth_token`.
5+
Add this parameter to `ExtractiveReader` and `SimilarityRanker` to allow
6+
loading private models.
7+
Proper handling of `token` during serialization: if it is a string (a possible valid token)
8+
it is not serialized.

test/preview/components/embedders/test_sentence_transformers_document_embedder.py

Lines changed: 9 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -14,7 +14,7 @@ def test_init_default(self):
1414
embedder = SentenceTransformersDocumentEmbedder(model_name_or_path="model")
1515
assert embedder.model_name_or_path == "model"
1616
assert embedder.device == "cpu"
17-
assert embedder.use_auth_token is None
17+
assert embedder.token is None
1818
assert embedder.prefix == ""
1919
assert embedder.suffix == ""
2020
assert embedder.batch_size == 32
@@ -28,7 +28,7 @@ def test_init_with_parameters(self):
2828
embedder = SentenceTransformersDocumentEmbedder(
2929
model_name_or_path="model",
3030
device="cuda",
31-
use_auth_token=True,
31+
token=True,
3232
prefix="prefix",
3333
suffix="suffix",
3434
batch_size=64,
@@ -39,7 +39,7 @@ def test_init_with_parameters(self):
3939
)
4040
assert embedder.model_name_or_path == "model"
4141
assert embedder.device == "cuda"
42-
assert embedder.use_auth_token is True
42+
assert embedder.token is True
4343
assert embedder.prefix == "prefix"
4444
assert embedder.suffix == "suffix"
4545
assert embedder.batch_size == 64
@@ -57,7 +57,7 @@ def test_to_dict(self):
5757
"init_parameters": {
5858
"model_name_or_path": "model",
5959
"device": "cpu",
60-
"use_auth_token": None,
60+
"token": None,
6161
"prefix": "",
6262
"suffix": "",
6363
"batch_size": 32,
@@ -73,7 +73,7 @@ def test_to_dict_with_custom_init_parameters(self):
7373
component = SentenceTransformersDocumentEmbedder(
7474
model_name_or_path="model",
7575
device="cuda",
76-
use_auth_token="the-token",
76+
token="the-token",
7777
prefix="prefix",
7878
suffix="suffix",
7979
batch_size=64,
@@ -83,12 +83,13 @@ def test_to_dict_with_custom_init_parameters(self):
8383
embedding_separator=" - ",
8484
)
8585
data = component.to_dict()
86+
8687
assert data == {
8788
"type": "SentenceTransformersDocumentEmbedder",
8889
"init_parameters": {
8990
"model_name_or_path": "model",
9091
"device": "cuda",
91-
"use_auth_token": "the-token",
92+
"token": None, # the token is not serialized
9293
"prefix": "prefix",
9394
"suffix": "suffix",
9495
"batch_size": 64,
@@ -106,7 +107,7 @@ def test_from_dict(self):
106107
"init_parameters": {
107108
"model_name_or_path": "model",
108109
"device": "cuda",
109-
"use_auth_token": "the-token",
110+
"token": None,
110111
"prefix": "prefix",
111112
"suffix": "suffix",
112113
"batch_size": 64,
@@ -119,7 +120,7 @@ def test_from_dict(self):
119120
component = SentenceTransformersDocumentEmbedder.from_dict(data)
120121
assert component.model_name_or_path == "model"
121122
assert component.device == "cuda"
122-
assert component.use_auth_token == "the-token"
123+
assert component.token is None
123124
assert component.prefix == "prefix"
124125
assert component.suffix == "suffix"
125126
assert component.batch_size == 64

test/preview/components/embedders/test_sentence_transformers_text_embedder.py

Lines changed: 26 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -12,7 +12,7 @@ def test_init_default(self):
1212
embedder = SentenceTransformersTextEmbedder(model_name_or_path="model")
1313
assert embedder.model_name_or_path == "model"
1414
assert embedder.device == "cpu"
15-
assert embedder.use_auth_token is None
15+
assert embedder.token is None
1616
assert embedder.prefix == ""
1717
assert embedder.suffix == ""
1818
assert embedder.batch_size == 32
@@ -24,7 +24,7 @@ def test_init_with_parameters(self):
2424
embedder = SentenceTransformersTextEmbedder(
2525
model_name_or_path="model",
2626
device="cuda",
27-
use_auth_token=True,
27+
token=True,
2828
prefix="prefix",
2929
suffix="suffix",
3030
batch_size=64,
@@ -33,7 +33,7 @@ def test_init_with_parameters(self):
3333
)
3434
assert embedder.model_name_or_path == "model"
3535
assert embedder.device == "cuda"
36-
assert embedder.use_auth_token is True
36+
assert embedder.token is True
3737
assert embedder.prefix == "prefix"
3838
assert embedder.suffix == "suffix"
3939
assert embedder.batch_size == 64
@@ -49,7 +49,7 @@ def test_to_dict(self):
4949
"init_parameters": {
5050
"model_name_or_path": "model",
5151
"device": "cpu",
52-
"use_auth_token": None,
52+
"token": None,
5353
"prefix": "",
5454
"suffix": "",
5555
"batch_size": 32,
@@ -63,7 +63,7 @@ def test_to_dict_with_custom_init_parameters(self):
6363
component = SentenceTransformersTextEmbedder(
6464
model_name_or_path="model",
6565
device="cuda",
66-
use_auth_token=True,
66+
token=True,
6767
prefix="prefix",
6868
suffix="suffix",
6969
batch_size=64,
@@ -76,7 +76,7 @@ def test_to_dict_with_custom_init_parameters(self):
7676
"init_parameters": {
7777
"model_name_or_path": "model",
7878
"device": "cuda",
79-
"use_auth_token": True,
79+
"token": True,
8080
"prefix": "prefix",
8181
"suffix": "suffix",
8282
"batch_size": 64,
@@ -85,14 +85,32 @@ def test_to_dict_with_custom_init_parameters(self):
8585
},
8686
}
8787

88+
@pytest.mark.unit
89+
def test_to_dict_not_serialize_token(self):
90+
component = SentenceTransformersTextEmbedder(model_name_or_path="model", token="awesome-token")
91+
data = component.to_dict()
92+
assert data == {
93+
"type": "SentenceTransformersTextEmbedder",
94+
"init_parameters": {
95+
"model_name_or_path": "model",
96+
"device": "cpu",
97+
"token": None,
98+
"prefix": "",
99+
"suffix": "",
100+
"batch_size": 32,
101+
"progress_bar": True,
102+
"normalize_embeddings": False,
103+
},
104+
}
105+
88106
@pytest.mark.unit
89107
def test_from_dict(self):
90108
data = {
91109
"type": "SentenceTransformersTextEmbedder",
92110
"init_parameters": {
93111
"model_name_or_path": "model",
94112
"device": "cuda",
95-
"use_auth_token": True,
113+
"token": True,
96114
"prefix": "prefix",
97115
"suffix": "suffix",
98116
"batch_size": 64,
@@ -103,7 +121,7 @@ def test_from_dict(self):
103121
component = SentenceTransformersTextEmbedder.from_dict(data)
104122
assert component.model_name_or_path == "model"
105123
assert component.device == "cuda"
106-
assert component.use_auth_token is True
124+
assert component.token is True
107125
assert component.prefix == "prefix"
108126
assert component.suffix == "suffix"
109127
assert component.batch_size == 64

0 commit comments

Comments
 (0)