Skip to content

Commit c4e6f1b

Browse files
authored
Add the configured prefix (e.g. ##) when decoding suffix tokens at the start
* Fixes an issue where suffix tokens appearing at the start of a decoded string lost their prefix. * Added initializing of ContinuingSubwordPrefix from tokenizer.json * Test DecodeStartingFromSuffix changed because of new behaviour. * Revert "Added initializing of ContinuingSubwordPrefix from tokenizer.json" This reverts commit d312979. * Revert "Fixes an issue where suffix tokens appearing at the start of a decoded string lost their prefix." This reverts commit f43f344. * Added decoder prefix and its initialization from tokenizer.json. * Set _decoderPrefix for vocab.txt --------- Co-authored-by: Georg Jung <[email protected]> Fixes #102
1 parent 627b1a0 commit c4e6f1b

File tree

6 files changed

+19
-2
lines changed

6 files changed

+19
-2
lines changed

src/FastBertTokenizer.Tests/Decode.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ public void DecodeStartingFromSuffix()
4949

5050
long[] startsWithSuffix = loremIpsum.AsSpan(2).ToArray();
5151
decoded = _uut.Decode(startsWithSuffix);
52-
decoded.ShouldStartWith("m ipsum");
52+
decoded.ShouldStartWith("##m ipsum");
5353
}
5454

5555
[Fact]

src/FastBertTokenizer/BertTokenizer.Decode.cs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ public string Decode(ReadOnlySpan<long> tokenIds, bool cleanupTokenizationSpaces
3131
else
3232
{
3333
// Our decoded text does not start with a word start but in the middle of a word.
34+
sb.Append(_decoderPrefix);
3435
sb.Append(_decodeSuffixes[tokenIds[0]]);
3536
}
3637

src/FastBertTokenizer/BertTokenizer.LoadTokenizerJson.cs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -174,6 +174,7 @@ void HandleLine(string line, int tokenId)
174174
// https://huggingface.co/docs/tokenizers/python/latest/api/reference.html#module-tokenizers.normalizers
175175
// strip_accents (bool, optional) – Whether to strip all accents. If this option is not specified (ie == None), then it will be determined by the value for lowercase (as in the original Bert).
176176
_stripAccents = tok.Normalizer.StripAccents ?? _lowercaseInput;
177+
_decoderPrefix = tok.Decoder?.Prefix ?? "##";
177178
_normalization = normalization;
178179
_unk = (unkId ?? throw new InvalidOperationException($"Vocabulary does not contain unknown token {unkToken}."), unkToken);
179180
_cls = (clsId ?? throw new InvalidOperationException($"Vocabulary does not contain cls token {clsToken}."), clsToken);

src/FastBertTokenizer/BertTokenizer.LoadVocab.cs

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@ namespace FastBertTokenizer;
1111

1212
public partial class BertTokenizer
1313
{
14+
private const string VocabTxtDefaultContinuingSubwordPrefix = "##";
15+
1416
/// <summary>
1517
/// Load a vocab.txt file that assigns an id to each token based on the line number.
1618
/// </summary>
@@ -75,7 +77,7 @@ void HandleLine(string line)
7577
{
7678
if (!string.IsNullOrEmpty(line))
7779
{
78-
if (line.StartsWith("##", StringComparison.Ordinal))
80+
if (line.StartsWith(VocabTxtDefaultContinuingSubwordPrefix, StringComparison.Ordinal))
7981
{
8082
suffixes[new StringSpanOrdinalKey(line[2..])] = i;
8183
}
@@ -149,6 +151,7 @@ void Finish()
149151
_suffixes = suffixes;
150152
#endif
151153
_lowercaseInput = convertInputToLowercase;
154+
_decoderPrefix = VocabTxtDefaultContinuingSubwordPrefix;
152155
_normalization = normalization;
153156
_addedTokens = new([(unknownToken, false), (clsToken, false), (sepToken, false), (padToken, false)]);
154157
}

src/FastBertTokenizer/BertTokenizer.cs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ public partial class BertTokenizer
3535
private bool _stripAccents = true;
3636
private NormalizationForm _normalization;
3737
private AddedTokens _addedTokens = default!;
38+
private string _decoderPrefix = default!;
3839

3940
// These will just be used if the consumer calls an API that _returns_ ReadOnlyMemory.
4041
// They will be reused for subsequent calls to avoid allocations.

src/FastBertTokenizer/TokenizerJson.cs

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,8 @@ internal record TokenizerJson
2525

2626
public required ModelSection Model { get; init; }
2727

28+
public DecoderSection? Decoder { get; init; }
29+
2830
// https://huggingface.co/docs/tokenizers/python/latest/api/reference.html#tokenizers.AddedToken
2931
internal record AddedToken
3032
{
@@ -91,4 +93,13 @@ internal record ModelSection
9193

9294
public required Dictionary<string, int> Vocab { get; set; }
9395
}
96+
97+
internal record DecoderSection
98+
{
99+
public string? Type { get; init; }
100+
101+
public string? Prefix { get; init; }
102+
103+
public bool? Cleanup { get; init; }
104+
}
94105
}

0 commit comments

Comments
 (0)