Skip to content

Commit bdc4bca

Browse files
Copilot and georg-jung committed
Optimize performance: EmitNoSpaceBefore, AddedTokens, and Decode dictionaries
Co-authored-by: georg-jung <[email protected]>
1 parent 38a9b6d commit bdc4bca

File tree

2 files changed

+37
-13
lines changed

2 files changed

+37
-13
lines changed

src/FastBertTokenizer/AddedTokens.cs

Lines changed: 14 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -13,15 +13,22 @@ public AddedTokens(IEnumerable<(string Content, bool Normalize)> addedTokens)
1313

1414
// This logic might not be perfect. Are there chars that are equal to others in an invariant case-insensitive comparison
1515
// but are neither the upper nor the lower variant of the original?
16-
var firstLettersToSearch = addedTokens
17-
.SelectMany(x => x.Normalize
18-
? (IEnumerable<char>)[x.Content[0], char.ToLowerInvariant(x.Content[0]), char.ToUpperInvariant(x.Content[0])]
19-
: [x.Content[0]])
20-
.Distinct();
16+
var firstLettersSet = new HashSet<char>();
17+
foreach (var (content, normalize) in addedTokens)
18+
{
19+
var firstChar = content[0];
20+
firstLettersSet.Add(firstChar);
21+
if (normalize)
22+
{
23+
firstLettersSet.Add(char.ToLowerInvariant(firstChar));
24+
firstLettersSet.Add(char.ToUpperInvariant(firstChar));
25+
}
26+
}
27+
2128
#if NET8_0_OR_GREATER
22-
FirstLetters = SearchValues.Create([.. firstLettersToSearch]);
29+
FirstLetters = SearchValues.Create([.. firstLettersSet]);
2330
#else
24-
FirstLetters = [.. firstLettersToSearch];
31+
FirstLetters = [.. firstLettersSet];
2532
#endif
2633
}
2734

src/FastBertTokenizer/BertTokenizer.Decode.cs

Lines changed: 23 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -15,8 +15,10 @@ public string Decode(ReadOnlySpan<long> tokenIds)
1515
_ = _prefixes ?? throw new InvalidOperationException("Vocabulary not loaded.");
1616
_ = _suffixes ?? throw new InvalidOperationException("Vocabulary not loaded.");
1717

18-
_decodeSuffixes ??= _suffixes.ToDictionary(x => x.Value, x => x.Key.ToString());
19-
_decodePrefixes ??= _prefixes.ToDictionary(x => x.Value, x => x.Key.ToString());
18+
if (_decodeSuffixes is null || _decodePrefixes is null)
19+
{
20+
InitializeDecodeDictionaries();
21+
}
2022

2123
if (tokenIds.Length == 0)
2224
{
@@ -72,9 +74,24 @@ public string Decode(ReadOnlySpan<long> tokenIds)
7274
// See https://github.com/huggingface/tokenizers/blob/daf361676bdfd14088f7e0bc087effc6a9cfdf3e/tokenizers/src/decoders/wordpiece.rs#L31
7375
private bool EmitNoSpaceBefore(string prefix)
7476
{
75-
return ".".Equals(prefix, StringComparison.Ordinal)
76-
|| "?".Equals(prefix, StringComparison.Ordinal)
77-
|| "!".Equals(prefix, StringComparison.Ordinal)
78-
|| ",".Equals(prefix, StringComparison.Ordinal);
77+
return prefix.Length == 1 && (prefix[0] == '.' || prefix[0] == '?' || prefix[0] == '!' || prefix[0] == ',');
78+
}
79+
80+
private void InitializeDecodeDictionaries()
81+
{
82+
var decodeSuffixes = new Dictionary<long, string>(_suffixes!.Count);
83+
foreach (var kvp in _suffixes!)
84+
{
85+
decodeSuffixes[kvp.Value] = kvp.Key.ToString();
86+
}
87+
88+
var decodePrefixes = new Dictionary<long, string>(_prefixes!.Count);
89+
foreach (var kvp in _prefixes!)
90+
{
91+
decodePrefixes[kvp.Value] = kvp.Key.ToString();
92+
}
93+
94+
_decodeSuffixes = decodeSuffixes;
95+
_decodePrefixes = decodePrefixes;
7996
}
8097
}

0 commit comments

Comments
 (0)