
Commit dd76c03

Merge branch 'master' into v3.2-release; version to 3.2.1
2 parents 539bf92 + f286d9f commit dd76c03

20 files changed: +219 -71 lines changed

.github/workflows/tests.yml

Lines changed: 4 additions & 0 deletions
@@ -63,6 +63,10 @@ jobs:
           python -m pip install --upgrade pip
           python -m pip install '.[train, onnx, openvino, dev]'
 
+      - name: Install model2vec
+        run: python -m pip install model2vec
+        if: ${{ contains(fromJSON('["3.10", "3.11", "3.12"]'), matrix.python-version) }}
+
       - name: Run unit tests
         run: |
           python -m pytest --durations 20 -sv tests/

examples/applications/embedding-quantization/semantic_search_usearch.py

Lines changed: 2 additions & 2 deletions
@@ -6,7 +6,7 @@
 from sentence_transformers.quantization import quantize_embeddings, semantic_search_usearch
 
 # 1. Load the quora corpus with questions
-dataset = load_dataset("quora", split="train").map(
+dataset = load_dataset("quora", split="train", trust_remote_code=True).map(
     lambda batch: {"text": [text for sample in batch["questions"] for text in sample["text"]]},
     batched=True,
     remove_columns=["questions", "is_duplicate"],
@@ -26,7 +26,7 @@
 # 4. Choose a target precision for the corpus embeddings
 corpus_precision = "binary"
 # Valid options are: "float32", "uint8", "int8", "ubinary", and "binary"
-# But usearch only supports "float32", "int8", and "binary"
+# But usearch only supports "float32", "int8", "binary" and "ubinary"
 
 # 5. Encode the corpus
 full_corpus_embeddings = model.encode(corpus, normalize_embeddings=True, show_progress_bar=True)
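
For context, this is roughly how the example wires those settings together end to end: a minimal sketch assuming the `quantize_embeddings` and `semantic_search_usearch` API imported above, with an illustrative model, corpus, and query.

from sentence_transformers import SentenceTransformer
from sentence_transformers.quantization import quantize_embeddings, semantic_search_usearch

model = SentenceTransformer("all-MiniLM-L6-v2")  # illustrative model
corpus = ["How do I bake bread?", "What is the capital of France?"]

# Encode in float32, then quantize to a usearch-supported precision
full_corpus_embeddings = model.encode(corpus, normalize_embeddings=True)
corpus_embeddings = quantize_embeddings(full_corpus_embeddings, precision="binary")

query_embeddings = model.encode(["bread baking instructions"], normalize_embeddings=True)
results, search_time, corpus_index = semantic_search_usearch(
    query_embeddings,
    corpus_embeddings=corpus_embeddings,
    corpus_precision="binary",
    top_k=2,
    rescore=False,  # skip rescoring to keep the sketch minimal
    output_index=True,
)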

pyproject.toml

Lines changed: 4 additions & 4 deletions
@@ -1,6 +1,6 @@
 [project]
 name = "sentence-transformers"
-version = "3.2.0"
+version = "3.2.1"
 description = "State-of-the-Art Text Embeddings"
 license = { text = "Apache 2.0" }
 readme = "README.md"
@@ -49,8 +49,8 @@ Repository = "https://github.com/UKPLab/sentence-transformers/"
 
 [project.optional-dependencies]
 train = ["datasets", "accelerate>=0.20.3"]
-onnx = ["optimum[onnxruntime]>=1.23.0"]
-onnx-gpu = ["optimum[onnxruntime-gpu]>=1.23.0"]
+onnx = ["optimum[onnxruntime]>=1.23.1"]
+onnx-gpu = ["optimum[onnxruntime-gpu]>=1.23.1"]
 openvino = ["optimum-intel[openvino]>=1.20.0"]
 dev = ["datasets", "accelerate>=0.20.3", "pre-commit", "pytest", "pytest-cov"]
 
@@ -100,4 +100,4 @@ testpaths = [
 addopts = "--strict-markers -m 'not slow'"
 markers = [
     "slow: marks tests as slow"
-]
+]

sentence_transformers/SentenceTransformer.py

Lines changed: 4 additions & 4 deletions
@@ -1718,10 +1718,10 @@ def _load_sbert_model(
 
                 # Try to initialize the module with a lot of kwargs, but only if the module supports them
                 # Otherwise we fall back to the load method
-                # try:
-                module = module_class(model_name_or_path, cache_dir=cache_folder, backend=self.backend, **kwargs)
-                # except TypeError:
-                #     module = module_class.load(model_name_or_path)
+                try:
+                    module = module_class(model_name_or_path, cache_dir=cache_folder, backend=self.backend, **kwargs)
+                except TypeError:
+                    module = module_class.load(model_name_or_path)
             else:
                 # Normalize does not require any files to be loaded
                 if module_class == Normalize:
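
The restored try/except matters for custom modules whose __init__ does not accept the newer cache_dir/backend keyword arguments: those now raise TypeError and fall back to the classmethod load. A hypothetical sketch (class and names invented for illustration):

from torch import nn

class MyLegacyModule(nn.Module):
    """Hypothetical module: __init__ only takes the path, so calling it with
    cache_dir=..., backend=..., **kwargs raises TypeError, and _load_sbert_model
    falls back to MyLegacyModule.load(model_name_or_path)."""

    def __init__(self, model_name_or_path: str):
        super().__init__()
        self.model_name_or_path = model_name_or_path

    @classmethod
    def load(cls, model_name_or_path: str) -> "MyLegacyModule":
        return cls(model_name_or_path)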

sentence_transformers/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 from __future__ import annotations
 
-__version__ = "3.2.0"
+__version__ = "3.2.1"
 __MODEL_HUB_ORGANIZATION__ = "sentence-transformers"
 
 import importlib

sentence_transformers/backend.py

Lines changed: 6 additions & 2 deletions
@@ -78,7 +78,9 @@ def export_optimized_onnx_model(
         or not isinstance(model[0], Transformer)
         or not isinstance(model[0].auto_model, ORTModelForFeatureExtraction)
     ):
-        raise ValueError('The model must be a SentenceTransformer model loaded with `backend="onnx"`.')
+        raise ValueError(
+            'The model must be a Transformer-based SentenceTransformer model loaded with `backend="onnx"`.'
+        )
 
     ort_model: ORTModelForFeatureExtraction = model[0].auto_model
     optimizer = ORTOptimizer.from_pretrained(ort_model)
@@ -158,7 +160,9 @@ def export_dynamic_quantized_onnx_model(
         or not isinstance(model[0], Transformer)
         or not isinstance(model[0].auto_model, ORTModelForFeatureExtraction)
     ):
-        raise ValueError('The model must be a SentenceTransformer model loaded with `backend="onnx"`.')
+        raise ValueError(
+            'The model must be a Transformer-based SentenceTransformer model loaded with `backend="onnx"`.'
+        )
 
     ort_model: ORTModelForFeatureExtraction = model[0].auto_model
     quantizer = ORTQuantizer.from_pretrained(ort_model)
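
For reference, a minimal sketch of how these export helpers are typically reached, assuming the documented ONNX backend workflow; the model name and output directory are illustrative:

from sentence_transformers import SentenceTransformer
from sentence_transformers.backend import export_optimized_onnx_model

# model[0] is a Transformer wrapping an ORTModelForFeatureExtraction,
# so the isinstance checks above pass
model = SentenceTransformer("all-MiniLM-L6-v2", backend="onnx")
export_optimized_onnx_model(
    model,
    optimization_config="O3",  # an optimum optimization level
    model_name_or_path="local/all-MiniLM-L6-v2-onnx-optimized",  # illustrative output path
)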

sentence_transformers/losses/CachedGISTEmbedLoss.py

Lines changed: 6 additions & 1 deletion
@@ -10,7 +10,7 @@
 from torch.utils.checkpoint import get_device_states, set_device_states
 
 from sentence_transformers import SentenceTransformer
-from sentence_transformers.models import Transformer
+from sentence_transformers.models import StaticEmbedding, Transformer
 
 
 class RandContext:
@@ -139,6 +139,11 @@ def __init__(
             trainer.train()
         """
         super().__init__()
+        if isinstance(model[0], StaticEmbedding):
+            raise ValueError(
+                "CachedGISTEmbedLoss is not compatible with a SentenceTransformer model based on a StaticEmbedding. "
+                "Consider using GISTEmbedLoss instead."
+            )
        self.model = model
        self.guide = guide
        self.temperature = temperature

sentence_transformers/losses/CachedMultipleNegativesRankingLoss.py

Lines changed: 7 additions & 0 deletions
@@ -10,6 +10,7 @@
 from torch.utils.checkpoint import get_device_states, set_device_states
 
 from sentence_transformers import SentenceTransformer, util
+from sentence_transformers.models import StaticEmbedding
 
 
 class RandContext:
@@ -145,6 +146,12 @@ def __init__(
             trainer.train()
         """
         super().__init__()
+        if isinstance(model[0], StaticEmbedding):
+            raise ValueError(
+                "CachedMultipleNegativesRankingLoss is not compatible with a SentenceTransformer model based on a StaticEmbedding. "
+                "Consider using MultipleNegativesRankingLoss instead."
+            )
+
        self.model = model
        self.scale = scale
        self.similarity_fct = similarity_fct
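
The guard pattern is identical across the Cached* losses in this commit. A minimal sketch of how it surfaces in user code, assuming a StaticEmbedding-based model (model names illustrative):

from sentence_transformers import SentenceTransformer, losses
from sentence_transformers.models import StaticEmbedding

static = StaticEmbedding.from_model2vec("minishlab/M2V_base_output")  # illustrative model2vec model
model = SentenceTransformer(modules=[static])

try:
    loss = losses.CachedMultipleNegativesRankingLoss(model)
except ValueError:
    # The Cached* losses replay the forward pass through gradient checkpointing,
    # which the StaticEmbedding path does not go through; the error message
    # points to the uncached equivalent instead.
    loss = losses.MultipleNegativesRankingLoss(model)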

sentence_transformers/losses/CachedMultipleNegativesSymmetricRankingLoss.py

Lines changed: 7 additions & 0 deletions
@@ -10,6 +10,7 @@
 
 from sentence_transformers import SentenceTransformer, util
 from sentence_transformers.losses.CachedMultipleNegativesRankingLoss import RandContext
+from sentence_transformers.models import StaticEmbedding
 
 
 def _backward_hook(
@@ -114,6 +115,12 @@ def __init__(
         - Scaling Deep Contrastive Learning Batch Size under Memory Limited Setup: https://arxiv.org/pdf/2101.06983.pdf
         """
         super().__init__()
+        if isinstance(model[0], StaticEmbedding):
+            raise ValueError(
+                "CachedMultipleNegativesSymmetricRankingLoss is not compatible with a SentenceTransformer model based on a StaticEmbedding. "
+                "Consider using MultipleNegativesSymmetricRankingLoss instead."
+            )
+
        self.model = model
        self.scale = scale
        self.similarity_fct = similarity_fct

sentence_transformers/losses/DenoisingAutoEncoderLoss.py

Lines changed: 7 additions & 0 deletions
@@ -7,6 +7,7 @@
 from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, PreTrainedModel
 
 from sentence_transformers import SentenceTransformer
+from sentence_transformers.models import StaticEmbedding
 
 logger = logging.getLogger(__name__)
 
@@ -73,6 +74,12 @@ def __init__(
             )
         """
         super().__init__()
+
+        if isinstance(model[0], StaticEmbedding):
+            raise ValueError(
+                "DenoisingAutoEncoderLoss is not compatible with a SentenceTransformer model based on a StaticEmbedding."
+            )
+
        self.encoder = model  # This will be the final model used during the inference time.
        self.tokenizer_encoder = model.tokenizer
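
For contrast, the intended use of this loss per its TSDAE recipe (model name illustrative); a StaticEmbedding model cannot provide the tokenizer and tied decoder this requires:

from sentence_transformers import SentenceTransformer, losses

# DenoisingAutoEncoderLoss builds a decoder that reuses the encoder's tokenizer,
# which is why a StaticEmbedding encoder is rejected above
model = SentenceTransformer("bert-base-uncased")
loss = losses.DenoisingAutoEncoderLoss(model, tie_encoder_decoder=True)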

sentence_transformers/losses/GISTEmbedLoss.py

Lines changed: 7 additions & 1 deletion
@@ -5,7 +5,7 @@
 import torch
 from torch import Tensor, nn
 
-from sentence_transformers.models import Transformer
+from sentence_transformers.models import StaticEmbedding, Transformer
 from sentence_transformers.SentenceTransformer import SentenceTransformer
 
 
@@ -91,6 +91,12 @@ def __init__(
         if self.must_retokenize:
             self.tokenizer = self.model.tokenizer
 
+            if isinstance(self.model[0], StaticEmbedding):
+                raise ValueError(
+                    "If we must retokenize because the guide model has a different tokenizer, "
+                    "then the Sentence Transformer model must not be based on a StaticEmbedding."
+                )
+
    def sim_matrix(self, embed1: Tensor, embed2: Tensor) -> Tensor:
        return self.similarity_fct(embed1.unsqueeze(1), embed2.unsqueeze(0))
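
This check only fires when retokenization is needed, i.e. when the guide model's tokenizer differs from the main model's. A small sketch under that assumption (model names illustrative):

from sentence_transformers import SentenceTransformer, losses

model = SentenceTransformer("all-MiniLM-L6-v2")   # BERT-style tokenizer
guide = SentenceTransformer("all-mpnet-base-v2")  # MPNet tokenizer, so must_retokenize is True
loss = losses.GISTEmbedLoss(model, guide)  # fine: model[0] is a Transformer

# With a StaticEmbedding-based `model` and this guide, the constructor
# would raise the ValueError introduced above.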

sentence_transformers/losses/Matryoshka2dLoss.py

Lines changed: 14 additions & 12 deletions
@@ -95,21 +95,23 @@ def __init__(
         Example:
             ::
 
-                from sentence_transformers import SentenceTransformer, losses, InputExample
-                from torch.utils.data import DataLoader
+                from sentence_transformers import SentenceTransformer, SentenceTransformerTrainer, losses
+                from datasets import Dataset
 
                 model = SentenceTransformer("microsoft/mpnet-base")
-                train_examples = [
-                    InputExample(texts=['Anchor 1', 'Positive 1']),
-                    InputExample(texts=['Anchor 2', 'Positive 2']),
-                ]
-                train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=32)
-                train_loss = losses.MultipleNegativesRankingLoss(model=model)
-                train_loss = losses.Matryoshka2dLoss(model, train_loss, [768, 512, 256, 128, 64])
-                model.fit(
-                    [(train_dataloader, train_loss)],
-                    epochs=10,
+                train_dataset = Dataset.from_dict({
+                    "anchor": ["It's nice weather outside today.", "He drove to work."],
+                    "positive": ["It's so sunny.", "He took the car to the office."],
+                })
+                loss = losses.MultipleNegativesRankingLoss(model)
+                loss = losses.Matryoshka2dLoss(model, loss, [768, 512, 256, 128, 64])
+
+                trainer = SentenceTransformerTrainer(
+                    model=model,
+                    train_dataset=train_dataset,
+                    loss=loss,
                 )
+                trainer.train()
         """
         matryoshka_loss = MatryoshkaLoss(
             model,

sentence_transformers/losses/MatryoshkaLoss.py

Lines changed: 14 additions & 12 deletions
@@ -101,21 +101,23 @@ def __init__(
         Example:
             ::
 
-                from sentence_transformers import SentenceTransformer, losses, InputExample
-                from torch.utils.data import DataLoader
+                from sentence_transformers import SentenceTransformer, SentenceTransformerTrainer, losses
+                from datasets import Dataset
 
                 model = SentenceTransformer("microsoft/mpnet-base")
-                train_examples = [
-                    InputExample(texts=['Anchor 1', 'Positive 1']),
-                    InputExample(texts=['Anchor 2', 'Positive 2']),
-                ]
-                train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=32)
-                train_loss = losses.MultipleNegativesRankingLoss(model=model)
-                train_loss = losses.MatryoshkaLoss(model, train_loss, [768, 512, 256, 128, 64])
-                model.fit(
-                    [(train_dataloader, train_loss)],
-                    epochs=10,
+                train_dataset = Dataset.from_dict({
+                    "anchor": ["It's nice weather outside today.", "He drove to work."],
+                    "positive": ["It's so sunny.", "He took the car to the office."],
+                })
+                loss = losses.MultipleNegativesRankingLoss(model)
+                loss = losses.MatryoshkaLoss(model, loss, [768, 512, 256, 128, 64])
+
+                trainer = SentenceTransformerTrainer(
+                    model=model,
+                    train_dataset=train_dataset,
+                    loss=loss,
                 )
+                trainer.train()
         """
         super().__init__()
         self.model = model

sentence_transformers/losses/MegaBatchMarginLoss.py

Lines changed: 19 additions & 14 deletions
@@ -59,25 +59,30 @@ def __init__(
         Example:
             ::
 
-                from sentence_transformers import SentenceTransformer, InputExample, losses
-                from torch.utils.data import DataLoader
+                from sentence_transformers import SentenceTransformer, SentenceTransformerTrainingArguments, SentenceTransformerTrainer, losses
+                from datasets import Dataset
 
-                model = SentenceTransformer('all-MiniLM-L6-v2')
-
-                total_examples = 500
                 train_batch_size = 250
                 train_mini_batch_size = 32
 
-                train_examples = [
-                    InputExample(texts=[f"This is sentence number {i}", f"This is sentence number {i+1}"]) for i in range(total_examples)
-                ]
-                train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=train_batch_size)
-                train_loss = losses.MegaBatchMarginLoss(model=model, mini_batch_size=train_mini_batch_size)
-
-                model.fit(
-                    [(train_dataloader, train_loss)],
-                    epochs=10,
+                model = SentenceTransformer('all-MiniLM-L6-v2')
+                train_dataset = Dataset.from_dict({
+                    "anchor": [f"This is sentence number {i}" for i in range(500)],
+                    "positive": [f"This is sentence number {i}" for i in range(1, 501)],
+                })
+                loss = losses.MegaBatchMarginLoss(model=model, mini_batch_size=train_mini_batch_size)
+
+                args = SentenceTransformerTrainingArguments(
+                    output_dir="output",
+                    per_device_train_batch_size=train_batch_size,
+                )
+                trainer = SentenceTransformerTrainer(
+                    model=model,
+                    args=args,
+                    train_dataset=train_dataset,
+                    loss=loss,
                 )
+                trainer.train()
         """
         super().__init__()
         self.model = model

sentence_transformers/model_card.py

Lines changed: 6 additions & 1 deletion
@@ -423,10 +423,15 @@ def set_widget_examples(self, dataset: Dataset | DatasetDict) -> None:
             columns = [
                 column
                 for column, feature in dataset[dataset_name].features.items()
-                if isinstance(feature, Value) and feature.dtype == "string" and column != "dataset_name"
+                if isinstance(feature, Value)
+                and (feature.dtype == "string" or feature.dtype == "large_string")
+                and column != "dataset_name"
             ]
             str_dataset = dataset[dataset_name].select_columns(columns)
             dataset_size = len(str_dataset)
+            if dataset_size == 0:
+                continue
+
             lengths = {}
             for idx, sample in enumerate(
                 str_dataset.select(random.sample(range(dataset_size), k=min(num_samples_to_check, dataset_size)))
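
A small sketch of why the dtype check needed widening, assuming standard `datasets` behavior; column names are illustrative. Large string columns report the "large_string" dtype, which the previous string-only check skipped:

from datasets import Dataset, Features, Value

ds = Dataset.from_dict(
    {"anchor": ["It's nice weather outside today."], "positive": ["It's so sunny."]},
    features=Features({"anchor": Value("large_string"), "positive": Value("large_string")}),
)
print(ds.features["anchor"].dtype)  # "large_string"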

sentence_transformers/models/StaticEmbedding.py

Lines changed: 12 additions & 4 deletions
@@ -159,9 +159,11 @@ def from_distillation(
         """
 
         try:
-            from model2vec import distill
+            from model2vec.distill import distill
         except ImportError:
-            raise ImportError("To use this method, please install the `model2vec` package: `pip install model2vec`")
+            raise ImportError(
+                "To use this method, please install the `model2vec` package: `pip install model2vec[distill]`"
+            )
 
         device = get_device_name()
         static_model = distill(
@@ -172,7 +174,10 @@ def from_distillation(
             apply_zipf=apply_zipf,
             use_subword=use_subword,
         )
-        embedding_weights = static_model.embedding.weight
+        if isinstance(static_model.embedding, np.ndarray):
+            embedding_weights = torch.from_numpy(static_model.embedding)
+        else:
+            embedding_weights = static_model.embedding.weight
         tokenizer: Tokenizer = static_model.tokenizer
 
         return cls(tokenizer, embedding_weights=embedding_weights, base_model=model_name)
@@ -200,7 +205,10 @@ def from_model2vec(cls, model_id_or_path: str) -> StaticEmbedding:
             raise ImportError("To use this method, please install the `model2vec` package: `pip install model2vec`")
 
         static_model = StaticModel.from_pretrained(model_id_or_path)
-        embedding_weights = static_model.embedding.weight
+        if isinstance(static_model.embedding, np.ndarray):
+            embedding_weights = torch.from_numpy(static_model.embedding)
+        else:
+            embedding_weights = static_model.embedding.weight
         tokenizer: Tokenizer = static_model.tokenizer
 
         return cls(tokenizer, embedding_weights=embedding_weights, base_model=model_id_or_path)
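
Both constructors are typically reached as below: a minimal usage sketch assuming the documented StaticEmbedding API, with illustrative model names.

from sentence_transformers import SentenceTransformer
from sentence_transformers.models import StaticEmbedding

# Distill a new static model (needs `pip install model2vec[distill]`) ...
static = StaticEmbedding.from_distillation("BAAI/bge-base-en-v1.5", device="cpu", pca_dims=256)
# ... or load a pretrained model2vec model (needs `pip install model2vec`)
static = StaticEmbedding.from_model2vec("minishlab/M2V_base_output")

model = SentenceTransformer(modules=[static])
embeddings = model.encode(["This is an example sentence."])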

sentence_transformers/models/Transformer.py

Lines changed: 3 additions & 3 deletions
@@ -155,7 +155,7 @@ def _load_openvino_model(self, model_name_or_path, config, cache_dir, **model_args):
         else:
             model_args["ov_config"] = {}
 
-        # Either load an exported model, or export the model to ONNX
+        # Either load an exported model, or export the model to OpenVINO
         self.auto_model: OVModelForFeatureExtraction = OVModelForFeatureExtraction.from_pretrained(
             model_name_or_path,
             config=config,
@@ -352,8 +352,8 @@ def forward(self, features: dict[str, torch.Tensor], **kwargs) -> dict[str, torch.Tensor]:
 
         features.update({"token_embeddings": output_tokens, "attention_mask": features["attention_mask"]})
 
-        if self.auto_model.config.output_hidden_states:
-            all_layer_idx = 2
+        if self.auto_model.config.output_hidden_states and len(output_states) > 2:
+            all_layer_idx = 2  # I.e. after last_hidden_states and pooler_output
             if len(output_states) < 3:  # Some models only output last_hidden_states and all_hidden_states
                 all_layer_idx = 1
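
The corrected comment refers to the OpenVINO loading path; a minimal sketch of how it is triggered, assuming the documented multi-backend loading (model name illustrative):

from sentence_transformers import SentenceTransformer

# Loads an already-exported OpenVINO model if one is available,
# otherwise exports the model to OpenVINO on the fly
model = SentenceTransformer("all-MiniLM-L6-v2", backend="openvino")
embeddings = model.encode(["OpenVINO inference example"])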
