
Commit dd76c03

Merge branch 'master' into v3.2-release; version to 3.2.1
2 parents 539bf92 + f286d9f commit dd76c03

20 files changed: +219 -71 lines changed

.github/workflows/tests.yml

Lines changed: 4 additions & 0 deletions
@@ -63,6 +63,10 @@ jobs:
           python -m pip install --upgrade pip
           python -m pip install '.[train, onnx, openvino, dev]'
 
+      - name: Install model2vec
+        run: python -m pip install model2vec
+        if: ${{ contains(fromJSON('["3.10", "3.11", "3.12"]'), matrix.python-version) }}
+
       - name: Run unit tests
         run: |
           python -m pytest --durations 20 -sv tests/

examples/applications/embedding-quantization/semantic_search_usearch.py

Lines changed: 2 additions & 2 deletions
@@ -6,7 +6,7 @@
 from sentence_transformers.quantization import quantize_embeddings, semantic_search_usearch
 
 # 1. Load the quora corpus with questions
-dataset = load_dataset("quora", split="train").map(
+dataset = load_dataset("quora", split="train", trust_remote_code=True).map(
     lambda batch: {"text": [text for sample in batch["questions"] for text in sample["text"]]},
     batched=True,
     remove_columns=["questions", "is_duplicate"],
@@ -26,7 +26,7 @@
 # 4. Choose a target precision for the corpus embeddings
 corpus_precision = "binary"
 # Valid options are: "float32", "uint8", "int8", "ubinary", and "binary"
-# But usearch only supports "float32", "int8", and "binary"
+# But usearch only supports "float32", "int8", "binary" and "ubinary"
 
 # 5. Encode the corpus
 full_corpus_embeddings = model.encode(corpus, normalize_embeddings=True, show_progress_bar=True)
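
For context, this is roughly how the example wires those settings together end to end: a minimal sketch assuming the `quantize_embeddings` and `semantic_search_usearch` API imported above, with an illustrative model, corpus, and query.

from sentence_transformers import SentenceTransformer
from sentence_transformers.quantization import quantize_embeddings, semantic_search_usearch

model = SentenceTransformer("all-MiniLM-L6-v2")  # illustrative model
corpus = ["How do I bake bread?", "What is the capital of France?"]

# Encode in float32, then quantize to a usearch-supported precision
full_corpus_embeddings = model.encode(corpus, normalize_embeddings=True)
corpus_embeddings = quantize_embeddings(full_corpus_embeddings, precision="binary")

query_embeddings = model.encode(["bread baking instructions"], normalize_embeddings=True)
results, search_time, corpus_index = semantic_search_usearch(
    query_embeddings,
    corpus_embeddings=corpus_embeddings,
    corpus_precision="binary",
    top_k=2,
    rescore=False,  # skip rescoring to keep the sketch minimal
    output_index=True,
)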

pyproject.toml

Lines changed: 4 additions & 4 deletions
@@ -1,6 +1,6 @@
 [project]
 name = "sentence-transformers"
-version = "3.2.0"
+version = "3.2.1"
 description = "State-of-the-Art Text Embeddings"
 license = { text = "Apache 2.0" }
 readme = "README.md"
@@ -49,8 +49,8 @@ Repository = "https://github.com/UKPLab/sentence-transformers/"
 
 [project.optional-dependencies]
 train = ["datasets", "accelerate>=0.20.3"]
-onnx = ["optimum[onnxruntime]>=1.23.0"]
-onnx-gpu = ["optimum[onnxruntime-gpu]>=1.23.0"]
+onnx = ["optimum[onnxruntime]>=1.23.1"]
+onnx-gpu = ["optimum[onnxruntime-gpu]>=1.23.1"]
 openvino = ["optimum-intel[openvino]>=1.20.0"]
 dev = ["datasets", "accelerate>=0.20.3", "pre-commit", "pytest", "pytest-cov"]
 
@@ -100,4 +100,4 @@ testpaths = [
 addopts = "--strict-markers -m 'not slow'"
 markers = [
     "slow: marks tests as slow"
-]
+]

sentence_transformers/SentenceTransformer.py

Lines changed: 4 additions & 4 deletions
@@ -1718,10 +1718,10 @@ def _load_sbert_model(
 
                 # Try to initialize the module with a lot of kwargs, but only if the module supports them
                 # Otherwise we fall back to the load method
-                # try:
-                module = module_class(model_name_or_path, cache_dir=cache_folder, backend=self.backend, **kwargs)
-                # except TypeError:
-                #     module = module_class.load(model_name_or_path)
+                try:
+                    module = module_class(model_name_or_path, cache_dir=cache_folder, backend=self.backend, **kwargs)
+                except TypeError:
+                    module = module_class.load(model_name_or_path)
             else:
                 # Normalize does not require any files to be loaded
                 if module_class == Normalize:
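
The restored try/except matters for custom modules whose __init__ does not accept the newer cache_dir/backend keyword arguments: those now raise TypeError and fall back to the classmethod load. A hypothetical sketch (class and names invented for illustration):

from torch import nn

class MyLegacyModule(nn.Module):
    """Hypothetical module: __init__ only takes the path, so calling it with
    cache_dir=..., backend=..., **kwargs raises TypeError, and _load_sbert_model
    falls back to MyLegacyModule.load(model_name_or_path)."""

    def __init__(self, model_name_or_path: str):
        super().__init__()
        self.model_name_or_path = model_name_or_path

    @classmethod
    def load(cls, model_name_or_path: str) -> "MyLegacyModule":
        return cls(model_name_or_path)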

sentence_transformers/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 from __future__ import annotations
 
-__version__ = "3.2.0"
+__version__ = "3.2.1"
 __MODEL_HUB_ORGANIZATION__ = "sentence-transformers"
 
 import importlib

sentence_transformers/backend.py

Lines changed: 6 additions & 2 deletions
@@ -78,7 +78,9 @@ def export_optimized_onnx_model(
         or not isinstance(model[0], Transformer)
         or not isinstance(model[0].auto_model, ORTModelForFeatureExtraction)
     ):
-        raise ValueError('The model must be a SentenceTransformer model loaded with `backend="onnx"`.')
+        raise ValueError(
+            'The model must be a Transformer-based SentenceTransformer model loaded with `backend="onnx"`.'
+        )
 
     ort_model: ORTModelForFeatureExtraction = model[0].auto_model
     optimizer = ORTOptimizer.from_pretrained(ort_model)
@@ -158,7 +160,9 @@ def export_dynamic_quantized_onnx_model(
         or not isinstance(model[0], Transformer)
         or not isinstance(model[0].auto_model, ORTModelForFeatureExtraction)
     ):
-        raise ValueError('The model must be a SentenceTransformer model loaded with `backend="onnx"`.')
+        raise ValueError(
+            'The model must be a Transformer-based SentenceTransformer model loaded with `backend="onnx"`.'
+        )
 
     ort_model: ORTModelForFeatureExtraction = model[0].auto_model
     quantizer = ORTQuantizer.from_pretrained(ort_model)
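
For reference, a minimal sketch of how these export helpers are typically reached, assuming the documented ONNX backend workflow; the model name and output directory are illustrative:

from sentence_transformers import SentenceTransformer
from sentence_transformers.backend import export_optimized_onnx_model

# model[0] is a Transformer wrapping an ORTModelForFeatureExtraction,
# so the isinstance checks above pass
model = SentenceTransformer("all-MiniLM-L6-v2", backend="onnx")
export_optimized_onnx_model(
    model,
    optimization_config="O3",  # an optimum optimization level
    model_name_or_path="local/all-MiniLM-L6-v2-onnx-optimized",  # illustrative output path
)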

sentence_transformers/losses/CachedGISTEmbedLoss.py

Lines changed: 6 additions & 1 deletion
@@ -10,7 +10,7 @@
 from torch.utils.checkpoint import get_device_states, set_device_states
 
 from sentence_transformers import SentenceTransformer
-from sentence_transformers.models import Transformer
+from sentence_transformers.models import StaticEmbedding, Transformer
 
 
 class RandContext:
@@ -139,6 +139,11 @@ def __init__(
             trainer.train()
         """
         super().__init__()
+        if isinstance(model[0], StaticEmbedding):
+            raise ValueError(
+                "CachedGISTEmbedLoss is not compatible with a SentenceTransformer model based on a StaticEmbedding. "
+                "Consider using GISTEmbedLoss instead."
+            )
        self.model = model
        self.guide = guide
        self.temperature = temperature

sentence_transformers/losses/CachedMultipleNegativesRankingLoss.py

Lines changed: 7 additions & 0 deletions
@@ -10,6 +10,7 @@
 from torch.utils.checkpoint import get_device_states, set_device_states
 
 from sentence_transformers import SentenceTransformer, util
+from sentence_transformers.models import StaticEmbedding
 
 
 class RandContext:
@@ -145,6 +146,12 @@ def __init__(
             trainer.train()
         """
         super().__init__()
+        if isinstance(model[0], StaticEmbedding):
+            raise ValueError(
+                "CachedMultipleNegativesRankingLoss is not compatible with a SentenceTransformer model based on a StaticEmbedding. "
+                "Consider using MultipleNegativesRankingLoss instead."
+            )
+
        self.model = model
        self.scale = scale
        self.similarity_fct = similarity_fct
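
The guard pattern is identical across the Cached* losses in this commit. A minimal sketch of how it surfaces in user code, assuming a StaticEmbedding-based model (model names illustrative):

from sentence_transformers import SentenceTransformer, losses
from sentence_transformers.models import StaticEmbedding

static = StaticEmbedding.from_model2vec("minishlab/M2V_base_output")  # illustrative model2vec model
model = SentenceTransformer(modules=[static])

try:
    loss = losses.CachedMultipleNegativesRankingLoss(model)
except ValueError:
    # The Cached* losses replay the forward pass through gradient checkpointing,
    # which the StaticEmbedding path does not go through; the error message
    # points to the uncached equivalent instead.
    loss = losses.MultipleNegativesRankingLoss(model)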

sentence_transformers/losses/CachedMultipleNegativesSymmetricRankingLoss.py

Lines changed: 7 additions & 0 deletions
@@ -10,6 +10,7 @@
 
 from sentence_transformers import SentenceTransformer, util
 from sentence_transformers.losses.CachedMultipleNegativesRankingLoss import RandContext
+from sentence_transformers.models import StaticEmbedding
 
 
 def _backward_hook(
@@ -114,6 +115,12 @@ def __init__(
         - Scaling Deep Contrastive Learning Batch Size under Memory Limited Setup: https://arxiv.org/pdf/2101.06983.pdf
         """
         super().__init__()
+        if isinstance(model[0], StaticEmbedding):
+            raise ValueError(
+                "CachedMultipleNegativesSymmetricRankingLoss is not compatible with a SentenceTransformer model based on a StaticEmbedding. "
+                "Consider using MultipleNegativesSymmetricRankingLoss instead."
+            )
+
        self.model = model
        self.scale = scale
        self.similarity_fct = similarity_fct

sentence_transformers/losses/DenoisingAutoEncoderLoss.py

Lines changed: 7 additions & 0 deletions
@@ -7,6 +7,7 @@
 from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, PreTrainedModel
 
 from sentence_transformers import SentenceTransformer
+from sentence_transformers.models import StaticEmbedding
 
 logger = logging.getLogger(__name__)
 
@@ -73,6 +74,12 @@ def __init__(
             )
         """
         super().__init__()
+
+        if isinstance(model[0], StaticEmbedding):
+            raise ValueError(
+                "DenoisingAutoEncoderLoss is not compatible with a SentenceTransformer model based on a StaticEmbedding."
+            )
+
        self.encoder = model  # This will be the final model used during the inference time.
        self.tokenizer_encoder = model.tokenizer
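
For contrast, the intended use of this loss per its TSDAE recipe (model name illustrative); a StaticEmbedding model cannot provide the tokenizer and tied decoder this requires:

from sentence_transformers import SentenceTransformer, losses

# DenoisingAutoEncoderLoss builds a decoder that reuses the encoder's tokenizer,
# which is why a StaticEmbedding encoder is rejected above
model = SentenceTransformer("bert-base-uncased")
loss = losses.DenoisingAutoEncoderLoss(model, tie_encoder_decoder=True)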

sentence_transformers/losses/GISTEmbedLoss.py

Lines changed: 7 additions & 1 deletion
@@ -5,7 +5,7 @@
 import torch
 from torch import Tensor, nn
 
-from sentence_transformers.models import Transformer
+from sentence_transformers.models import StaticEmbedding, Transformer
 from sentence_transformers.SentenceTransformer import SentenceTransformer
 
 
@@ -91,6 +91,12 @@ def __init__(
         if self.must_retokenize:
             self.tokenizer = self.model.tokenizer
 
+            if isinstance(self.model[0], StaticEmbedding):
+                raise ValueError(
+                    "If we must retokenize because the guide model has a different tokenizer, "
+                    "then the Sentence Transformer model must not be based on a StaticEmbedding."
+                )
+
    def sim_matrix(self, embed1: Tensor, embed2: Tensor) -> Tensor:
        return self.similarity_fct(embed1.unsqueeze(1), embed2.unsqueeze(0))
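
This check only fires when retokenization is needed, i.e. when the guide model's tokenizer differs from the main model's. A small sketch under that assumption (model names illustrative):

from sentence_transformers import SentenceTransformer, losses

model = SentenceTransformer("all-MiniLM-L6-v2")   # BERT-style tokenizer
guide = SentenceTransformer("all-mpnet-base-v2")  # MPNet tokenizer, so must_retokenize is True
loss = losses.GISTEmbedLoss(model, guide)  # fine: model[0] is a Transformer

# With a StaticEmbedding-based `model` and this guide, the constructor
# would raise the ValueError introduced above.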

sentence_transformers/losses/Matryoshka2dLoss.py

Lines changed: 14 additions & 12 deletions
@@ -95,21 +95,23 @@ def __init__(
         Example:
             ::
 
-                from sentence_transformers import SentenceTransformer, losses, InputExample
-                from torch.utils.data import DataLoader
+                from sentence_transformers import SentenceTransformer, SentenceTransformerTrainer, losses
+                from datasets import Dataset
 
                 model = SentenceTransformer("microsoft/mpnet-base")
-                train_examples = [
-                    InputExample(texts=['Anchor 1', 'Positive 1']),
-                    InputExample(texts=['Anchor 2', 'Positive 2']),
-                ]
-                train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=32)
-                train_loss = losses.MultipleNegativesRankingLoss(model=model)
-                train_loss = losses.Matryoshka2dLoss(model, train_loss, [768, 512, 256, 128, 64])
-                model.fit(
-                    [(train_dataloader, train_loss)],
-                    epochs=10,
+                train_dataset = Dataset.from_dict({
+                    "anchor": ["It's nice weather outside today.", "He drove to work."],
+                    "positive": ["It's so sunny.", "He took the car to the office."],
+                })
+                loss = losses.MultipleNegativesRankingLoss(model)
+                loss = losses.Matryoshka2dLoss(model, loss, [768, 512, 256, 128, 64])
+
+                trainer = SentenceTransformerTrainer(
+                    model=model,
+                    train_dataset=train_dataset,
+                    loss=loss,
                 )
+                trainer.train()
         """
         matryoshka_loss = MatryoshkaLoss(
             model,

sentence_transformers/losses/MatryoshkaLoss.py

Lines changed: 14 additions & 12 deletions
@@ -101,21 +101,23 @@ def __init__(
         Example:
             ::
 
-                from sentence_transformers import SentenceTransformer, losses, InputExample
-                from torch.utils.data import DataLoader
+                from sentence_transformers import SentenceTransformer, SentenceTransformerTrainer, losses
+                from datasets import Dataset
 
                 model = SentenceTransformer("microsoft/mpnet-base")
-                train_examples = [
-                    InputExample(texts=['Anchor 1', 'Positive 1']),
-                    InputExample(texts=['Anchor 2', 'Positive 2']),
-                ]
-                train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=32)
-                train_loss = losses.MultipleNegativesRankingLoss(model=model)
-                train_loss = losses.MatryoshkaLoss(model, train_loss, [768, 512, 256, 128, 64])
-                model.fit(
-                    [(train_dataloader, train_loss)],
-                    epochs=10,
+                train_dataset = Dataset.from_dict({
+                    "anchor": ["It's nice weather outside today.", "He drove to work."],
+                    "positive": ["It's so sunny.", "He took the car to the office."],
+                })
+                loss = losses.MultipleNegativesRankingLoss(model)
+                loss = losses.MatryoshkaLoss(model, loss, [768, 512, 256, 128, 64])
+
+                trainer = SentenceTransformerTrainer(
+                    model=model,
+                    train_dataset=train_dataset,
+                    loss=loss,
                 )
+                trainer.train()
         """
         super().__init__()
         self.model = model

sentence_transformers/losses/MegaBatchMarginLoss.py

Lines changed: 19 additions & 14 deletions
@@ -59,25 +59,30 @@ def __init__(
         Example:
             ::
 
-                from sentence_transformers import SentenceTransformer, InputExample, losses
-                from torch.utils.data import DataLoader
+                from sentence_transformers import SentenceTransformer, SentenceTransformerTrainingArguments, SentenceTransformerTrainer, losses
+                from datasets import Dataset
 
-                model = SentenceTransformer('all-MiniLM-L6-v2')
-
-                total_examples = 500
                 train_batch_size = 250
                 train_mini_batch_size = 32
 
-                train_examples = [
-                    InputExample(texts=[f"This is sentence number {i}", f"This is sentence number {i+1}"]) for i in range(total_examples)
-                ]
-                train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=train_batch_size)
-                train_loss = losses.MegaBatchMarginLoss(model=model, mini_batch_size=train_mini_batch_size)
-
-                model.fit(
-                    [(train_dataloader, train_loss)],
-                    epochs=10,
+                model = SentenceTransformer('all-MiniLM-L6-v2')
+                train_dataset = Dataset.from_dict({
+                    "anchor": [f"This is sentence number {i}" for i in range(500)],
+                    "positive": [f"This is sentence number {i}" for i in range(1, 501)],
+                })
+                loss = losses.MegaBatchMarginLoss(model=model, mini_batch_size=train_mini_batch_size)
+
+                args = SentenceTransformerTrainingArguments(
+                    output_dir="output",
+                    per_device_train_batch_size=train_batch_size,
+                )
+                trainer = SentenceTransformerTrainer(
+                    model=model,
+                    args=args,
+                    train_dataset=train_dataset,
+                    loss=loss,
                 )
+                trainer.train()
         """
         super().__init__()
         self.model = model

sentence_transformers/model_card.py

Lines changed: 6 additions & 1 deletion
@@ -423,10 +423,15 @@ def set_widget_examples(self, dataset: Dataset | DatasetDict) -> None:
             columns = [
                 column
                 for column, feature in dataset[dataset_name].features.items()
-                if isinstance(feature, Value) and feature.dtype == "string" and column != "dataset_name"
+                if isinstance(feature, Value)
+                and (feature.dtype == "string" or feature.dtype == "large_string")
+                and column != "dataset_name"
             ]
             str_dataset = dataset[dataset_name].select_columns(columns)
             dataset_size = len(str_dataset)
+            if dataset_size == 0:
+                continue
+
             lengths = {}
             for idx, sample in enumerate(
                 str_dataset.select(random.sample(range(dataset_size), k=min(num_samples_to_check, dataset_size)))
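
A small sketch of why the dtype check needed widening, assuming standard `datasets` behavior; column names are illustrative. Large string columns report the "large_string" dtype, which the previous string-only check skipped:

from datasets import Dataset, Features, Value

ds = Dataset.from_dict(
    {"anchor": ["It's nice weather outside today."], "positive": ["It's so sunny."]},
    features=Features({"anchor": Value("large_string"), "positive": Value("large_string")}),
)
print(ds.features["anchor"].dtype)  # "large_string"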

sentence_transformers/models/StaticEmbedding.py

Lines changed: 12 additions & 4 deletions
@@ -159,9 +159,11 @@ def from_distillation(
         """
 
         try:
-            from model2vec import distill
+            from model2vec.distill import distill
         except ImportError:
-            raise ImportError("To use this method, please install the `model2vec` package: `pip install model2vec`")
+            raise ImportError(
+                "To use this method, please install the `model2vec` package: `pip install model2vec[distill]`"
+            )
 
         device = get_device_name()
         static_model = distill(
@@ -172,7 +174,10 @@ def from_distillation(
             apply_zipf=apply_zipf,
             use_subword=use_subword,
         )
-        embedding_weights = static_model.embedding.weight
+        if isinstance(static_model.embedding, np.ndarray):
+            embedding_weights = torch.from_numpy(static_model.embedding)
+        else:
+            embedding_weights = static_model.embedding.weight
         tokenizer: Tokenizer = static_model.tokenizer
 
         return cls(tokenizer, embedding_weights=embedding_weights, base_model=model_name)
@@ -200,7 +205,10 @@ def from_model2vec(cls, model_id_or_path: str) -> StaticEmbedding:
             raise ImportError("To use this method, please install the `model2vec` package: `pip install model2vec`")
 
         static_model = StaticModel.from_pretrained(model_id_or_path)
-        embedding_weights = static_model.embedding.weight
+        if isinstance(static_model.embedding, np.ndarray):
+            embedding_weights = torch.from_numpy(static_model.embedding)
+        else:
+            embedding_weights = static_model.embedding.weight
         tokenizer: Tokenizer = static_model.tokenizer
 
         return cls(tokenizer, embedding_weights=embedding_weights, base_model=model_id_or_path)
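
Both constructors are typically reached as below: a minimal usage sketch assuming the documented StaticEmbedding API, with illustrative model names.

from sentence_transformers import SentenceTransformer
from sentence_transformers.models import StaticEmbedding

# Distill a new static model (needs `pip install model2vec[distill]`) ...
static = StaticEmbedding.from_distillation("BAAI/bge-base-en-v1.5", device="cpu", pca_dims=256)
# ... or load a pretrained model2vec model (needs `pip install model2vec`)
static = StaticEmbedding.from_model2vec("minishlab/M2V_base_output")

model = SentenceTransformer(modules=[static])
embeddings = model.encode(["This is an example sentence."])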

sentence_transformers/models/Transformer.py

Lines changed: 3 additions & 3 deletions
@@ -155,7 +155,7 @@ def _load_openvino_model(self, model_name_or_path, config, cache_dir, **model_args):
         else:
             model_args["ov_config"] = {}
 
-        # Either load an exported model, or export the model to ONNX
+        # Either load an exported model, or export the model to OpenVINO
         self.auto_model: OVModelForFeatureExtraction = OVModelForFeatureExtraction.from_pretrained(
             model_name_or_path,
             config=config,
@@ -352,8 +352,8 @@ def forward(self, features: dict[str, torch.Tensor], **kwargs) -> dict[str, torch.Tensor]:
 
         features.update({"token_embeddings": output_tokens, "attention_mask": features["attention_mask"]})
 
-        if self.auto_model.config.output_hidden_states:
-            all_layer_idx = 2
+        if self.auto_model.config.output_hidden_states and len(output_states) > 2:
+            all_layer_idx = 2  # I.e. after last_hidden_states and pooler_output
             if len(output_states) < 3:  # Some models only output last_hidden_states and all_hidden_states
                 all_layer_idx = 1
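
The corrected comment refers to the OpenVINO loading path; a minimal sketch of how it is triggered, assuming the documented multi-backend loading (model name illustrative):

from sentence_transformers import SentenceTransformer

# Loads an already-exported OpenVINO model if one is available,
# otherwise exports the model to OpenVINO on the fly
model = SentenceTransformer("all-MiniLM-L6-v2", backend="openvino")
embeddings = model.encode(["OpenVINO inference example"])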
