Skip to content

Commit c4e6f1b

Browse files
authored
Add the configured prefix (e.g. ##) when decoding suffix tokens at the start
* Fixes an issue where suffix tokens appearing at the start of a decoded string lost their prefix. * Added initializing of ContinuingSubwordPrefix from tokenizer.json * Test DecodeStartingFromSuffix changed because of new behaviour. * Revert "Added initializing of ContinuingSubwordPrefix from tokenizer.json" This reverts commit d312979. * Revert "Fixes an issue where suffix tokens appearing at the start of a decoded string lost their prefix." This reverts commit f43f344. * Added decoder prefix and its initialization from tokenizer.json. * Set _decoderPrefix for vocab.txt --------- Co-authored-by: Georg Jung <[email protected]> Fixes #102
1 parent 627b1a0 commit c4e6f1b

File tree

6 files changed

+19
-2
lines changed

6 files changed

+19
-2
lines changed

src/FastBertTokenizer.Tests/Decode.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ public void DecodeStartingFromSuffix()
4949

5050
long[] startsWithSuffix = loremIpsum.AsSpan(2).ToArray();
5151
decoded = _uut.Decode(startsWithSuffix);
52-
decoded.ShouldStartWith("m ipsum");
52+
decoded.ShouldStartWith("##m ipsum");
5353
}
5454

5555
[Fact]

src/FastBertTokenizer/BertTokenizer.Decode.cs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ public string Decode(ReadOnlySpan<long> tokenIds, bool cleanupTokenizationSpaces
3131
else
3232
{
3333
// Our decoded text does not start with a word start but in the middle of a word.
34+
sb.Append(_decoderPrefix);
3435
sb.Append(_decodeSuffixes[tokenIds[0]]);
3536
}
3637

src/FastBertTokenizer/BertTokenizer.LoadTokenizerJson.cs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -174,6 +174,7 @@ void HandleLine(string line, int tokenId)
174174
// https://huggingface.co/docs/tokenizers/python/latest/api/reference.html#module-tokenizers.normalizers
175175
// strip_accents (bool, optional) – Whether to strip all accents. If this option is not specified (ie == None), then it will be determined by the value for lowercase (as in the original Bert).
176176
_stripAccents = tok.Normalizer.StripAccents ?? _lowercaseInput;
177+
_decoderPrefix = tok.Decoder?.Prefix ?? "##";
177178
_normalization = normalization;
178179
_unk = (unkId ?? throw new InvalidOperationException($"Vocabulary does not contain unknown token {unkToken}."), unkToken);
179180
_cls = (clsId ?? throw new InvalidOperationException($"Vocabulary does not contain cls token {clsToken}."), clsToken);

src/FastBertTokenizer/BertTokenizer.LoadVocab.cs

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@ namespace FastBertTokenizer;
1111

1212
public partial class BertTokenizer
1313
{
14+
private const string VocabTxtDefaultContinuingSubwordPrefix = "##";
15+
1416
/// <summary>
1517
/// Load a vocab.txt file that assigns an id to each token based on the line number.
1618
/// </summary>
@@ -75,7 +77,7 @@ void HandleLine(string line)
7577
{
7678
if (!string.IsNullOrEmpty(line))
7779
{
78-
if (line.StartsWith("##", StringComparison.Ordinal))
80+
if (line.StartsWith(VocabTxtDefaultContinuingSubwordPrefix, StringComparison.Ordinal))
7981
{
8082
suffixes[new StringSpanOrdinalKey(line[2..])] = i;
8183
}
@@ -149,6 +151,7 @@ void Finish()
149151
_suffixes = suffixes;
150152
#endif
151153
_lowercaseInput = convertInputToLowercase;
154+
_decoderPrefix = VocabTxtDefaultContinuingSubwordPrefix;
152155
_normalization = normalization;
153156
_addedTokens = new([(unknownToken, false), (clsToken, false), (sepToken, false), (padToken, false)]);
154157
}

src/FastBertTokenizer/BertTokenizer.cs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ public partial class BertTokenizer
3535
private bool _stripAccents = true;
3636
private NormalizationForm _normalization;
3737
private AddedTokens _addedTokens = default!;
38+
private string _decoderPrefix = default!;
3839

3940
// These will just be used if the consumer calls an API that _returns_ ReadOnlyMemory.
4041
// They will be reused for subsequent calls to avoid allocations.

src/FastBertTokenizer/TokenizerJson.cs

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,8 @@ internal record TokenizerJson
2525

2626
public required ModelSection Model { get; init; }
2727

28+
public DecoderSection? Decoder { get; init; }
29+
2830
// https://huggingface.co/docs/tokenizers/python/latest/api/reference.html#tokenizers.AddedToken
2931
internal record AddedToken
3032
{
@@ -91,4 +93,13 @@ internal record ModelSection
9193

9294
public required Dictionary<string, int> Vocab { get; set; }
9395
}
96+
97+
internal record DecoderSection
98+
{
99+
public string? Type { get; init; }
100+
101+
public string? Prefix { get; init; }
102+
103+
public bool? Cleanup { get; init; }
104+
}
94105
}

0 commit comments

Comments
 (0)