Skip to content

Commit

Permalink
Add EmbeddingInfo NamedTuple (#392)
Browse files Browse the repository at this point in the history
Co-authored-by: Ben Shapira <[email protected]>
  • Loading branch information
bensha6757 and Ben Shapira authored Jan 22, 2025
1 parent 8cfd176 commit ab66613
Showing 1 changed file with 10 additions and 2 deletions.
12 changes: 10 additions & 2 deletions fuse/data/tokenizers/modular_tokenizer/inject_utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import re
from collections import defaultdict
from typing import Dict, List, Optional, Tuple, Union
from typing import Dict, List, NamedTuple, Optional, Tuple, Union
from warnings import warn

import torch
Expand All @@ -13,6 +13,11 @@
from fuse.utils import NDict


class EmbeddingInfo(NamedTuple):
    """One external-embedding entry collected while building model inputs.

    Instances are appended to the per-embedding-model lists in
    ``external_embeddings_info`` in place of the plain
    ``(location, embedding_input)`` tuple used previously. Because this is a
    ``NamedTuple`` with the same field order, positional indexing and tuple
    unpacking of existing entries keep working.
    """

    # Running token count (``num_tokens_token_so_far``) at the moment the
    # entry is recorded.
    location: int
    # Value read from the sample dict under the embeddings key.
    # NOTE(review): annotated as ``str``; confirm callers never store
    # another type (e.g. a tensor) under that key.
    embedding_input: str


class InjectorToModularTokenizerLib:
"""
InjectorTokenizer builds on top of ModularTokenizer.
Expand Down Expand Up @@ -227,7 +232,10 @@ def build_scalars_and_embeddings(

embedding_input = sample_dict[embeddings_key]
external_embeddings_info[embedding_model_name].append(
(num_tokens_token_so_far, embedding_input)
EmbeddingInfo(
location=num_tokens_token_so_far,
embedding_input=embedding_input,
)
)
elif tokenizer_name.startswith("VECTORS_"):
raise NotImplementedError
Expand Down

0 comments on commit ab66613

Please sign in to comment.