Skip to content

API Reference

EmBuddy Core

Source code in src/embuddy/core.py
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
class EmBuddy:
    def __init__(self, model_name: str):
        """A buddy for using text embeddings.

        Args:
            model_name (str): SentenceTransformer model used for embedding
        """
        self.model_name = model_name
        self.model: SentenceTransformer = SentenceTransformer(model_name)
        # `doc_cache[i]` is the document whose embedding is row `i` of
        # `embedding_cache`; the two are always extended together.
        self.doc_cache: List[str] = []
        self.embedding_cache: np.ndarray = np.empty(
            shape=(0, self.model.get_sentence_embedding_dimension())
        )
        self.ann: Optional[NNDescent] = None
        self.umap_embeddings: Optional[np.ndarray] = None
        # Number of cached docs when the ANN index was last built; compared
        # against the cache size to warn about a stale index.
        self._last_built_len: int = 0

    def embed(
        self,
        docs: Union[str, List[str]],
        cache: bool = True,
        show_progress_bar: bool = True,
    ) -> np.ndarray:
        """Embed documents.

        With `cache=True`, only documents not already in the cache are sent to
        the model. New documents are appended to the cache in the order they
        first appear in `docs`, so cache row order is deterministic.

        Args:
            docs (Union[str, List[str]]): A string or list of strings to embed
            cache (bool, optional): Whether to cache embedding results. Defaults to True.
            show_progress_bar (bool, optional): Show progress bar for sentence-transformers. Defaults to True.

        Returns:
            np.ndarray: Embeddings of input documents, one row per input doc.
        """
        if isinstance(docs, str):
            docs = [docs]

        if not cache:
            return self.model.encode(docs, show_progress_bar=show_progress_bar)

        # Row lookup for cached docs. `setdefault` keeps the first row for a
        # doc, matching what `list.index` would return.
        positions: Dict[str, int] = {}
        for row, doc in enumerate(self.doc_cache):
            positions.setdefault(doc, row)

        # `dict.fromkeys` de-duplicates while preserving first-seen order.
        new_docs = list(dict.fromkeys(d for d in docs if d not in positions))
        if new_docs:
            # Encode before touching the caches so a failing encode cannot
            # leave `doc_cache` longer than `embedding_cache`.
            new_embeddings = self.model.encode(
                new_docs, show_progress_bar=show_progress_bar
            )
            first_new_row = len(self.doc_cache)
            self.doc_cache.extend(new_docs)
            self.embedding_cache = np.append(
                self.embedding_cache, new_embeddings, axis=0
            )
            for offset, doc in enumerate(new_docs):
                positions[doc] = first_new_row + offset

        rows = np.asarray([positions[doc] for doc in docs], dtype=np.intp)
        return self.embedding_cache[rows].astype(np.float32)

    def __call__(
        self,
        docs: Union[str, List[str]],
        cache: bool = True,
        show_progress_bar: bool = True,
    ) -> np.ndarray:
        """Shortcut for [EmBuddy.embed][src.embuddy.core.EmBuddy.embed]"""
        return self.embed(docs=docs, cache=cache, show_progress_bar=show_progress_bar)

    def save(self, path: Union[str, Path], overwrite: bool = True) -> zarr.Group:
        """Save the current state of EmBuddy to disk.

        Embeddings and Docs arrays are saved and compressed using zarr.
        The ANN Index, if it exists, is saved using joblib with `zstd` compression.

        Note that this is a directory containing the required data.

        Args:
            path (Union[str, Path]): Location to save EmBuddy data
            overwrite (bool, optional): Whether to overwrite existing data. Defaults to True.

        Returns:
            zarr.Group: Group object containing an `embeddings` array of the
            embeddings and a `docs` array of the documents
        """
        path = Path(path)

        g = zarr.group(store=str(path), overwrite=overwrite)
        g.attrs["model_name"] = self.model_name
        # Persist how many docs the ANN index covers so `load` can tell
        # whether the index is stale instead of always assuming zero.
        g.attrs["last_built_len"] = int(self._last_built_len)
        g.create_dataset(name="embeddings", data=self.embedding_cache)
        g.create_dataset(name="docs", data=self.doc_cache, dtype=str)
        if self.umap_embeddings is not None:
            g.create_dataset(name="umap_embeddings", data=self.umap_embeddings)
        if self.ann is not None:
            joblib.dump(self.ann, str(path / "ann_index.zstd"), compress=("zstd", 5))
        return g

    @classmethod
    def load(cls, path: Union[str, Path]) -> "EmBuddy":
        """Load a previously saved EmBuddy from disk

        Args:
            path (Union[str, Path]): Directory previously written by `save`

        Returns:
            EmBuddy: A loaded instance of EmBuddy
        """
        path = Path(path)

        g = zarr.open_group(str(path))
        # `cls` (not a hard-coded class name) so subclasses load as themselves.
        embuddy = cls(g.attrs["model_name"])
        embuddy.doc_cache = list(g["docs"])
        embuddy.embedding_cache = g["embeddings"][:]
        if "umap_embeddings" in g:
            # `[:]` materialises an ndarray rather than keeping a zarr handle.
            embuddy.umap_embeddings = g["umap_embeddings"][:]

        ann_path = path / "ann_index.zstd"
        if ann_path.exists():
            # NOTE: joblib.load unpickles its input; only load trusted paths.
            embuddy.ann = joblib.load(str(ann_path))
            # Stores written without this attribute fall back to 0, which
            # keeps the conservative "index may be stale" warning.
            embuddy._last_built_len = int(g.attrs.get("last_built_len", 0))
        return embuddy

    def build_ann_index(
        self, nndescent_kwargs: Optional[Dict[str, Any]] = None
    ) -> None:
        """Builds the Approximate Nearest Neighbors (ANN) index

        Args:
            nndescent_kwargs (Optional[Dict[str, Any]], optional): Optional kwargs to pass to NNDescent.
                Defaults to None.

        Raises:
            NNDescentHyperplaneError: If ANN can't be built due to small data.
        """
        nndescent_kwargs = _build_nndescent_kwargs_dict(nndescent_kwargs)
        # With fewer than 60 cached docs, n_neighbors is forced to 10.
        if len(self.doc_cache) < 60:
            nndescent_kwargs["n_neighbors"] = 10

        try:
            index = NNDescent(self.embedding_cache, **nndescent_kwargs)
            index.prepare()
        except ValueError as e:
            if "hyperplane" in str(e):
                raise NNDescentHyperplaneError(
                    "NNDescent cannot build index."
                    " Usually this means data is too small -"
                    " try adding more embeddings and build again."
                ) from e
            raise  # pragma: no cover

        # Only commit state once the index was built successfully.
        self.ann = index
        self._last_built_len = len(self.doc_cache)

    def _ensure_index_ready(self) -> None:
        """Raise if no ANN index exists; warn if the cache outgrew the index."""
        if self.ann is None or not isinstance(self.ann, NNDescent):
            raise IndexNotBuiltError(
                "Approximate Nearest Neighbors index not built."
                " Call `build_ann_index` before using this method."
            )
        if self._last_built_len < len(self.doc_cache):
            # stacklevel=2 attributes the warning to the public query method.
            warn(
                f"{len(self.doc_cache)} embeddings exist in cache, "
                f"but ANN was last built with {self._last_built_len} embeddings. "
                "You are not querying all embeddings. You can rebuild the ANN "
                "index with `build_ann_index`.",
                stacklevel=2,
            )

    def _collect_neighbors(
        self, query_data: np.ndarray, k: int
    ) -> List[List[Tuple[int, str, float]]]:
        """Query the ANN index and pair each hit with its cached document."""
        neighbors, distances = self.ann.query(query_data, k=k)
        return [
            [
                (int(ix), self.doc_cache[ix], float(dist))
                for ix, dist in zip(row_ix, row_dist)
            ]
            for row_ix, row_dist in zip(neighbors, distances)
        ]

    def nearest_neighbors(
        self, docs: Union[str, List[str]], k: int = 10
    ) -> List[List[Tuple[int, str, float]]]:
        """Find the nearest neighbors (i.e. most similar) from
         cached docs for the input documents.

        Args:
            docs (Union[str, List[str]]): Docs to find nearest neighbors of.
            k (int, optional): Number of nearest neighbors. Defaults to 10.

        Raises:
            IndexNotBuiltError: If `build_ann_index` has not been run

        Returns:
            List[List[Tuple[int, str, float]]]: For each document, a list of tuples
                containing the document index, document string, and distance for the nearest
                neighbors for each input doc. Sorted by distance.
        """
        self._ensure_index_ready()
        if isinstance(docs, str):
            docs = [docs]
        if not docs:
            return []

        # `embed` on a list yields one row per doc; it must not be flattened
        # into a single row, or multi-doc queries get the wrong dimension.
        query_data = self.embed(docs, cache=False)
        return self._collect_neighbors(query_data, k)

    def nearest_neighbors_vector(
        self, query_vector: np.ndarray, k: int = 10
    ) -> List[List[Tuple[int, str, float]]]:
        """Find the nearest neighbors (i.e. most similar) from
         cached docs of the input vectors.

        You can use this to find similar documents to an arbitrary vector --
        e.g. a vector for a document that doesn't exist, or the mean vector for
        a collection of documents.

        Args:
            query_vector (np.ndarray): A single vector, or a 2-D array with
                one query vector per row, to find the nearest neighbors for.
            k (int, optional): Number of nearest neighbors. Defaults to 10.

        Raises:
            IndexNotBuiltError: If `build_ann_index` has not been run

        Returns:
            List[List[Tuple[int, str, float]]]: For each vector, a list of tuples
                containing the document index, document string, and distance for the nearest
                neighbors for each input doc. Sorted by distance.
        """
        self._ensure_index_ready()
        # A 1-D vector becomes one row; a 2-D batch keeps one row per vector.
        query_data = np.atleast_2d(query_vector)
        return self._collect_neighbors(query_data, k)

    def build_umap(
        self, umap_kwargs: Optional[Dict[str, Any]] = None, return_array=True
    ) -> Optional[np.ndarray]:
        """Builds a 2D projection of the embeddings with UMAP
        default settings except metric="cosine".

        The projection is stored on `self.umap_embeddings`.

        Args:
            umap_kwargs (Optional[Dict[str, Any]], optional): Custom UMAP kwargs. Defaults to None.
            return_array (bool, optional): Return the UMAP array. Defaults to True.

        Returns:
            Optional[np.ndarray]: The projection if `return_array` is true, else None.
        """
        umap_kwargs = _build_umap_kwargs_dict(umap_kwargs)
        umap = UMAP(**umap_kwargs)
        self.umap_embeddings = umap.fit_transform(self.embedding_cache)
        return self.umap_embeddings if return_array else None

__call__(docs, cache=True)

Shortcut for EmBuddy.embed

Source code in src/embuddy/core.py
75
76
77
def __call__(
    self,
    docs: Union[str, List[str]],
    cache: bool = True,
    show_progress_bar: bool = True,
) -> np.ndarray:
    """Shortcut for [EmBuddy.embed][src.embuddy.core.EmBuddy.embed]

    Args:
        docs (Union[str, List[str]]): A string or list of strings to embed
        cache (bool, optional): Whether to cache embedding results. Defaults to True.
        show_progress_bar (bool, optional): Forwarded to `embed`. Defaults to True.

    Returns:
        np.ndarray: Whatever `embed` returns for the same arguments.
    """
    return self.embed(docs=docs, cache=cache, show_progress_bar=show_progress_bar)

__init__(model_name)

A buddy for using text embeddings.

Parameters:

Name Type Description Default
model_name str

SentenceTransformer model used for embedding

required
Source code in src/embuddy/core.py
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
def __init__(self, model_name: str):
    """Load the embedding model and start with empty caches.

    Args:
        model_name (str): Name of the SentenceTransformer model to embed with
    """
    self.model_name = model_name
    self.model: SentenceTransformer = SentenceTransformer(model_name)

    # Zero rows to begin with; the column count follows the model's
    # embedding dimension.
    embedding_dim = self.model.get_sentence_embedding_dimension()
    self.doc_cache: List[str] = []
    self.embedding_cache: np.ndarray = np.empty(shape=(0, embedding_dim))

    # No ANN index or UMAP projection exists yet.
    self.ann: Optional[NNDescent] = None
    self.umap_embeddings: Optional[np.ndarray] = None
    self._last_built_len: int = 0

build_ann_index(nndescent_kwargs=None)

Builds the Approximate Nearest Neighbors (ANN) index

Parameters:

Name Type Description Default
nndescent_kwargs Optional[Dict[str, Any]]

Optional kwargs to pass to NNDescent. Defaults to None.

None

Raises:

Type Description
NNDescentHyperplaneError

If ANN can't be built due to small data.

Source code in src/embuddy/core.py
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
def build_ann_index(
    self, nndescent_kwargs: Optional[Dict[str, Any]] = None
) -> None:
    """Builds the Approximate Nearest Neighbors (ANN) index

    On success the index is stored on `self.ann` and the number of cached
    docs it covers is recorded in `self._last_built_len`.

    Args:
        nndescent_kwargs (Optional[Dict[str, Any]], optional): Optional kwargs to pass to NNDescent.
            Defaults to None.

    Raises:
        NNDescentHyperplaneError: If ANN can't be built due to small data.
    """
    nndescent_kwargs = _build_nndescent_kwargs_dict(nndescent_kwargs)
    # With fewer than 60 cached docs, n_neighbors is forced to 10.
    nndescent_kwargs["n_neighbors"] = (
        10 if len(self.doc_cache) < 60 else nndescent_kwargs["n_neighbors"]
    )

    # Keep the try body to the calls that can raise the hyperplane error.
    try:
        index = NNDescent(self.embedding_cache, **nndescent_kwargs)
        index.prepare()
    except ValueError as e:
        if "hyperplane" in str(e):
            # Chain the original error so the root cause stays visible.
            raise NNDescentHyperplaneError(
                "NNDescent cannot build index."
                " Usually this means data is too small -"
                " try adding more embeddings and build again."
            ) from e
        raise  # pragma: no cover

    # Only commit state once the index was built successfully.
    self.ann = index
    self._last_built_len = len(self.doc_cache)

build_umap(umap_kwargs=None, return_array=True)

Builds a 2D projection of the embeddings using UMAP with its default settings, except metric="cosine".

Parameters:

Name Type Description Default
umap_kwargs Optional[Dict[str, Any]]

Custom UMAP kwargs. Defaults to None.

None
return_array bool

Return the UMAP array. Defaults to True.

True
Source code in src/embuddy/core.py
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
def build_umap(
    self, umap_kwargs: Optional[Dict[str, Any]] = None, return_array=True
) -> Optional[np.ndarray]:
    """Fit UMAP on the cached embeddings and keep the resulting projection.

    Per the original description this is a 2D projection using UMAP default
    settings except metric="cosine"; the effective settings come from
    `_build_umap_kwargs_dict`. The result is stored on `self.umap_embeddings`.

    Args:
        umap_kwargs (Optional[Dict[str, Any]], optional): Custom UMAP kwargs. Defaults to None.
        return_array (bool, optional): Return the UMAP array. Defaults to True.

    Returns:
        Optional[np.ndarray]: The projection when `return_array` is truthy, else None.
    """
    reducer = UMAP(**_build_umap_kwargs_dict(umap_kwargs))
    projection = reducer.fit_transform(self.embedding_cache)
    self.umap_embeddings = projection
    return projection if return_array else None

embed(docs, cache=True, show_progress_bar=True)

Embed documents.

Parameters:

Name Type Description Default
docs Union[str, List[str]]

A string or list of strings to embed

required
cache bool

Whether to cache embedding results. Defaults to True.

True
show_progress_bar bool

Show progress bar for sentence-transformers. Defaults to True.

True

Returns:

Type Description
np.ndarray

np.ndarray: Embeddings of input documents.

Source code in src/embuddy/core.py
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
def embed(
    self,
    docs: Union[str, List[str]],
    cache: bool = True,
    show_progress_bar: bool = True,
) -> np.ndarray:
    """Embed documents.

    With `cache=True`, only documents not already in the cache are sent to
    the model. New documents are appended to the cache in the order they
    first appear in `docs`, so cache row order is deterministic.

    Args:
        docs (Union[str, List[str]]): A string or list of strings to embed
        cache (bool, optional): Whether to cache embedding results. Defaults to True.
        show_progress_bar (bool, optional): Show progress bar for sentence-transformers. Defaults to True.

    Returns:
        np.ndarray: Embeddings of input documents, one row per input doc.
    """
    if isinstance(docs, str):
        docs = [docs]

    if not cache:
        return self.model.encode(docs, show_progress_bar=show_progress_bar)

    # Row lookup for cached docs. `setdefault` keeps the first row for a
    # doc, matching what `list.index` would return.
    positions = {}
    for row, doc in enumerate(self.doc_cache):
        positions.setdefault(doc, row)

    # `dict.fromkeys` de-duplicates while preserving first-seen order.
    new_docs = list(dict.fromkeys(d for d in docs if d not in positions))
    if new_docs:
        # Encode before touching the caches so a failing encode cannot
        # leave `doc_cache` longer than `embedding_cache`.
        new_embeddings = self.model.encode(
            new_docs, show_progress_bar=show_progress_bar
        )
        first_new_row = len(self.doc_cache)
        self.doc_cache.extend(new_docs)
        self.embedding_cache = np.append(
            self.embedding_cache, new_embeddings, axis=0
        )
        for offset, doc in enumerate(new_docs):
            positions[doc] = first_new_row + offset

    rows = np.asarray([positions[doc] for doc in docs], dtype=np.intp)
    return self.embedding_cache[rows].astype(np.float32)

load(path) classmethod

Load a previously saved EmBuddy from disk

Returns:

Name Type Description
EmBuddy 'EmBuddy'

A loaded instance of EmBuddy

Source code in src/embuddy/core.py
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
@classmethod
def load(cls, path: Union[str, Path]) -> "EmBuddy":
    """Load a previously saved EmBuddy from disk

    Args:
        path (Union[str, Path]): Directory holding the saved zarr group and,
            optionally, an `ann_index.zstd` file.

    Returns:
        EmBuddy: A loaded instance of EmBuddy
    """
    path = Path(path)

    g = zarr.open_group(str(path))
    # `cls` (not a hard-coded class name) so subclasses load as themselves.
    embuddy = cls(g.attrs["model_name"])
    embuddy.doc_cache = list(g["docs"])
    embuddy.embedding_cache = g["embeddings"][:]
    if "umap_embeddings" in g:
        # `[:]` materialises an ndarray rather than keeping a zarr handle.
        embuddy.umap_embeddings = g["umap_embeddings"][:]

    ann_path = path / "ann_index.zstd"
    if ann_path.exists():
        # NOTE: joblib.load unpickles its input; only load trusted paths.
        embuddy.ann = joblib.load(str(ann_path))
        # Restore how many docs the index covers when the store recorded it
        # under "last_built_len"; otherwise keep 0, which leaves the
        # conservative "index may be stale" warning in place.
        embuddy._last_built_len = int(g.attrs.get("last_built_len", 0))
    return embuddy

nearest_neighbors(docs, k=10)

Find the nearest neighbors (i.e. most similar) from cached docs for the input documents.

Parameters:

Name Type Description Default
docs Union[str, List[str]]

Docs to find nearest neighbors of.

required
k int

Number of nearest neighbors. Defaults to 10.

10

Raises:

Type Description
IndexNotBuiltError

If build_ann_index has not been run

Returns:

Type Description
List[List[Tuple[int, str, float]]]

List[List[Tuple[int, str, float]]]: For each document, a list of tuples containing the document index, document string, and distance for the nearest neighbors for each input doc. Sorted by distance.

Source code in src/embuddy/core.py
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
def nearest_neighbors(
    self, docs: Union[str, List[str]], k: int = 10
) -> List[List[Tuple[int, str, float]]]:
    """Find the nearest neighbors (i.e. most similar) from
     cached docs for the input documents.

    Args:
        docs (Union[str, List[str]]): Docs to find nearest neighbors of.
        k (int, optional): Number of nearest neighbors. Defaults to 10.

    Raises:
        IndexNotBuiltError: If `build_ann_index` has not been run

    Returns:
        List[List[Tuple[int, str, float]]]: For each document, a list of tuples
            containing the document index, document string, and distance for the nearest
            neighbors for each input doc. Sorted by distance.
    """
    if self.ann is None or not isinstance(self.ann, NNDescent):
        raise IndexNotBuiltError(
            "Approximate Nearest Neighbors index not built."
            " Call `build_ann_index` before using this method."
        )
    if self._last_built_len < len(self.doc_cache):
        warn(
            f"{len(self.doc_cache)} embeddings exist in cache, "
            f"but ANN was last built with {self._last_built_len} embeddings. "
            "You are not querying all embeddings. You can rebuild the ANN "
            "index with `build_ann_index`."
        )
    if isinstance(docs, str):
        docs = [docs]
    if not docs:
        return []

    # `embed` on a list yields one row per doc. It must not be flattened
    # into a single row (the old `reshape(1, -1)`), or a multi-doc query is
    # sent to the index as one vector of the wrong dimension.
    query_data = self.embed(docs, cache=False)
    neighbors, distances = self.ann.query(query_data, k=k)

    return [
        [
            (int(ix), self.doc_cache[ix], float(dist))
            for ix, dist in zip(row_ix, row_dist)
        ]
        for row_ix, row_dist in zip(neighbors, distances)
    ]

nearest_neighbors_vector(query_vector, k=10)

Find the nearest neighbors (i.e. the most similar cached docs) for the input vectors.

You can use this to find similar documents to an arbitrary vector -- e.g. a vector for a document that doesn't exist, or the mean vector for a collection of documents.

Parameters:

Name Type Description Default
query_vector np.ndarray

An array of vectors to find the nearest neighbors for.

required
k int

Number of nearest neighbors. Defaults to 10.

10

Raises:

Type Description
IndexNotBuiltError

If build_ann_index has not been run

Returns:

Type Description
List[List[Tuple[int, str, float]]]

List[List[Tuple[int, str, float]]]: For each vector, a list of tuples containing the document index, document string, and distance for the nearest neighbors for each input doc. Sorted by distance.

Source code in src/embuddy/core.py
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
def nearest_neighbors_vector(
    self, query_vector: np.ndarray, k: int = 10
) -> List[List[Tuple[int, str, float]]]:
    """Find the nearest neighbors (i.e. most similar) from
     cached docs of the input vectors.

    You can use this to find similar documents to an arbitrary vector --
    e.g. a vector for a document that doesn't exist, or the mean vector for
    a collection of documents.

    Args:
        query_vector (np.ndarray): A single vector, or a 2-D array with
            one query vector per row, to find the nearest neighbors for.
        k (int, optional): Number of nearest neighbors. Defaults to 10.

    Raises:
        IndexNotBuiltError: If `build_ann_index` has not been run

    Returns:
        List[List[Tuple[int, str, float]]]: For each vector, a list of tuples
            containing the document index, document string, and distance for the nearest
            neighbors for each input doc. Sorted by distance.
    """
    if self.ann is None or not isinstance(self.ann, NNDescent):
        raise IndexNotBuiltError(
            "Approximate Nearest Neighbors index not built."
            " Call `build_ann_index` before using this method."
        )
    if self._last_built_len < len(self.doc_cache):
        warn(
            f"{len(self.doc_cache)} embeddings exist in cache, "
            f"but ANN was last built with {self._last_built_len} embeddings. "
            "You are not querying all embeddings. You can rebuild the ANN "
            "index with `build_ann_index`."
        )
    # A 1-D vector becomes one row; a 2-D batch keeps one row per vector
    # (the old `reshape(1, -1)` flattened a batch into a single long row).
    query_data = np.atleast_2d(query_vector)

    neighbors, distances = self.ann.query(query_data, k=k)

    return [
        [
            (int(ix), self.doc_cache[ix], float(dist))
            for ix, dist in zip(row_ix, row_dist)
        ]
        for row_ix, row_dist in zip(neighbors, distances)
    ]

save(path, overwrite=True)

Save the current state of EmBuddy to disk.

Embeddings and Docs arrays are saved and compressed using zarr. The ANN Index, if it exists, is saved using joblib with zstd compression.

Note that this is a directory containing the required data.

Parameters:

Name Type Description Default
path Union[str, Path]

Location to save EmBuddy data

required
overwrite bool

Whether to overwrite existing data. Defaults to True.

True

Returns:

Type Description
zarr.Group

zarr.Group: Group object containing an embeddings array of the embeddings and a docs array of the documents.

Source code in src/embuddy/core.py
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
def save(self, path: Union[str, Path], overwrite: bool = True) -> zarr.Group:
    """Save the current state of EmBuddy to disk.

    Embeddings and Docs arrays are saved and compressed using zarr.
    The ANN Index, if it exists, is saved using joblib with `zstd` compression.
    The model name and the number of docs the ANN index was last built with
    are stored as group attributes.

    Note that this is a directory containing the required data.

    Args:
        path (Union[str, Path]): Location to save EmBuddy data
        overwrite (bool, optional): Whether to overwrite existing data. Defaults to True.

    Returns:
        zarr.Group: Group object containing an `embeddings` array of the
        embeddings and a `docs` array of the documents
    """
    path = Path(path)

    g = zarr.group(store=str(path), overwrite=overwrite)
    g.attrs["model_name"] = self.model_name
    # Record how many docs the ANN index covers so index staleness can be
    # judged after a reload instead of being assumed to be zero.
    g.attrs["last_built_len"] = int(self._last_built_len)
    g.create_dataset(name="embeddings", data=self.embedding_cache)
    g.create_dataset(name="docs", data=self.doc_cache, dtype=str)
    if self.umap_embeddings is not None:
        g.create_dataset(name="umap_embeddings", data=self.umap_embeddings)
    if self.ann is not None:
        joblib.dump(self.ann, str(path / "ann_index.zstd"), compress=("zstd", 5))
    return g

Errors

IndexNotBuiltError

Bases: ValueError, AttributeError

Exception raised when attempting to find nearest neighbors before the ANN index is built.

Source code in src/embuddy/errors.py
1
2
3
4
5
6
class IndexNotBuiltError(ValueError, AttributeError):
    """Raised when nearest neighbors are requested before the ANN index
    has been built.

    Derives from both ValueError and AttributeError, so handlers for either
    base type will catch it.
    """

NNDescentHyperplaneError

Bases: Exception

Exception raised when NNDescent can't find a hyperplane.

Usually occurs with small data.

Source code in src/embuddy/errors.py
 9
10
11
12
13
14
15
class NNDescentHyperplaneError(Exception):
    """Raised when NNDescent fails to find a hyperplane.

    This usually happens when there is too little data.
    """