Skip to content

Commit b408540

Browse files
Merge pull request #25 from x-tabdeveloping/e5encoder
E5Encoder, documentation dependencies, mkdocstrings
2 parents 923196b + 70b9879 commit b408540

5 files changed

Lines changed: 139 additions & 1 deletion

File tree

docs/encoders.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,3 +33,11 @@ Turftopic currently supports OpenAI, Voyage and Cohere embeddings.
3333
:::turftopic.encoders.OpenAIEmbeddings
3434

3535
:::turftopic.encoders.VoyageEmbeddings
36+
37+
## E5 Embeddings
38+
39+
Most E5 models expect the input to be prefixed with something like `"query: "` (see the [multilingual-e5-small](https://huggingface.co/intfloat/multilingual-e5-small) model card).
40+
In instructional E5 models, it is also possible to add an instruction, following the format `f"Instruct: {task_description} \nQuery: {document}"` (see the [multilingual-e5-large-instruct](https://huggingface.co/intfloat/multilingual-e5-large-instruct) model card).
41+
In Turftopic, E5 embeddings — including the prefixing — are handled by the `E5Encoder`.
42+
43+
:::turftopic.encoders.E5Encoder

mkdocs.yml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,6 @@ plugins:
4242
show_symbol_type_heading: true
4343
docstring_style: numpy
4444
heading_level: 3
45-
custom_templates: templates - content.code.select
4645

4746
markdown_extensions:
4847
- pymdownx.highlight:

pyproject.toml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,13 @@ pyro-ppl = { version = "^1.8.0", optional = true }
2222
[tool.poetry.extras]
2323
pyro-ppl = ["pyro-ppl"]
2424

25+
[tool.poetry.group.docs]
26+
optional = true
27+
28+
[tool.poetry.group.docs.dependencies]
29+
mkdocs = "^1.5.2"
30+
mkdocs-material = "^9.5.12"
31+
mkdocstrings = { version = "^0.24.0", extras = ["python"] }
2532

2633
[build-system]
2734
requires = ["poetry-core"]

turftopic/encoders/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,12 @@
22
from turftopic.encoders.cohere import CohereEmbeddings
33
from turftopic.encoders.openai import OpenAIEmbeddings
44
from turftopic.encoders.voyage import VoyageEmbeddings
5+
from turftopic.encoders.e5 import E5Encoder
56

67
__all__ = [
78
"CohereEmbeddings",
89
"OpenAIEmbeddings",
910
"VoyageEmbeddings",
1011
"ExternalEncoder",
12+
"E5Encoder"
1113
]

turftopic/encoders/e5.py

Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
import warnings
2+
from typing import Callable, Optional
3+
4+
import numpy as np
5+
from sentence_transformers import SentenceTransformer
6+
7+
8+
class E5Encoder(SentenceTransformer):
    """Encoder model oriented at using E5 models.

    ```python
    from turftopic.encoders import E5Encoder
    from turftopic import GMM
    model = GMM(10, encoder=E5Encoder(model_name="intfloat/multilingual-e5-small", prefix="query: "))
    ```

    Parameters
    ----------
    model_name: str
        Embedding model to use.
        Either a SentenceTransformers pre-trained model or a model from HuggingFace Hub.
    prefix: Optional[str]
        A string that gets added to the start of each document (formats each document followingly: `f"{prefix}{text}"`).
        Expected by most E5 models. Consult model cards on Hugging Face to see what prefix is expected by your specific model.
    preprocessor: Optional[Callable]
        A function that formats documents as desired.
        Only applied when `prefix` is None; if both are given, `prefix` takes
        precedence and `preprocessor` is ignored (a warning is emitted).
        Both input and output must be string.
        First argument must be input text.
        By default `None`.

    Examples
    --------
    Instructional models can also be used.
    In this case, the documents should be prefixed with a one-sentence instruction that describes the task.
    See Notes for available models and instruction suggestions.

    ```python
    from turftopic.encoders import E5Encoder

    def add_instruct_prefix(document: str) -> str:
        task_description = "YOUR_INSTRUCTION"
        return f'Instruct: {task_description}\nQuery: {document}'

    encoder = E5Encoder(model_name="intfloat/multilingual-e5-large-instruct", preprocessor=add_instruct_prefix)
    model = GMM(10, encoder=encoder)
    ```

    Or the same can be done using a `prefix` argument:

    ```python
    from turftopic.encoders import E5Encoder
    from turftopic import GMM
    prefix = "Instruct: YOUR_INSTRUCTION\nQuery: "
    encoder = E5Encoder(model_name="intfloat/multilingual-e5-large-instruct", prefix=prefix)
    model = GMM(10, encoder=encoder)
    ```

    Notes
    -----
    See available E5-based sentence transformers on Hugging Face Hub:
    https://huggingface.co/models?library=sentence-transformers&sort=trending&search=e5

    Instruction templates:
    https://github.com/microsoft/unilm/blob/9c0f1ff7ca53431fe47d2637dfe253643d94185b/e5/utils.py#L106
    """

    def __init__(
        self,
        model_name: str,
        prefix: Optional[str] = None,
        preprocessor: Optional[Callable] = None,
        **kwargs,
    ):
        super().__init__(model_name, **kwargs)

        # Warn when both formatting mechanisms are given: `prefix` wins.
        if prefix is not None and preprocessor is not None:
            warnings.warn(
                "Both `prefix` and `preprocessor` are specified. `preprocessor` will be ignored! "
                "To avoid this warning, specify only one of them.",
            )

        if prefix is not None:
            self.preprocessor = lambda x: f"{prefix}{x}"
        elif preprocessor is not None:
            # Validate eagerly so a broken preprocessor fails at construction
            # time, not midway through encoding a corpus.
            # NOTE: do not use a bare `assert` here — assertions are stripped
            # under `python -O`, which would silently disable this check.
            try:
                valid = self._is_preprocessor_valid(preprocessor=preprocessor)
            except Exception as error:
                # Preserve the original failure as the cause for debuggability,
                # instead of swallowing it with a bare `except:`.
                raise AssertionError(
                    "`preprocessor` failed validation. Please make sure your preprocessor returns type `str`."
                ) from error
            if not valid:
                raise AssertionError(
                    "`preprocessor` failed validation. Please make sure your preprocessor returns type `str`."
                )
            self.preprocessor = preprocessor
        else:
            raise ValueError(
                "Either `prefix` or `preprocessor` must be specified."
            )

    @staticmethod
    def _is_preprocessor_valid(preprocessor: Callable) -> bool:
        """Check that the preprocessor returns `str` for sample inputs.

        Probes with both ``None`` and a plain string, mirroring the original
        validation: the preprocessor must produce a ``str`` in both cases.
        """
        sample_inputs = (
            None,
            "What are assertions? and why would you use them?",
        )
        return all(
            isinstance(preprocessor(sample), str) for sample in sample_inputs
        )

    def encode(self, sentences: list[str], **kwargs) -> np.ndarray:
        """Encode documents, applying the configured prefix/preprocessor first.

        Parameters
        ----------
        sentences: list[str]
            Input text.

        Notes
        -----
        See docs for `SentenceTransformer.encode` for available **kwargs
        https://www.sbert.net/docs/package_reference/SentenceTransformer.html#sentence_transformers.SentenceTransformer.encode
        """
        sentences = [self.preprocessor(sentence) for sentence in sentences]
        return super().encode(sentences, **kwargs)

0 commit comments

Comments
 (0)