Skip to content

Commit 952c751

Browse files
authored
Update tokenizer algorithm to use `is [..]` list patterns (#6561)
1 parent a8dde6b commit 952c751

File tree

2 files changed

+4
-7
lines changed

2 files changed

+4
-7
lines changed

src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/SimpleWordTokenizer.cs

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -76,18 +76,13 @@ public static IEnumerable<string> WordTokenize(ReadOnlyMemory<char> text)
7676
}
7777

7878
// Join words split by a hyphen at a line break ("-\n" or "-\r\n")
79-
if (span[0] == '-' &&
80-
span.Length > 1 &&
81-
span[1] == '\n')
79+
if (span is ['-', '\n', ..])
8280
{
8381
text = text.Slice(2);
8482
continue;
8583
}
8684

87-
if (span[0] == '-' &&
88-
span.Length > 2 &&
89-
span[1] == '\r' &&
90-
span[2] == '\n')
85+
if (span is ['-', '\r', '\n', ..])
9186
{
9287
text = text.Slice(3);
9388
continue;

test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/SimpleTokenizerTests.cs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@ public class SimpleTokenizerTests
2020
[InlineData("word1-word2", new[] { "WORD1", "-", "WORD2" })]
2121
[InlineData("word1 - word2", new[] { "WORD1", "-", "WORD2" })]
2222
[InlineData("word1-\n word2", new[] { "WORD1", "WORD2" })]
23+
[InlineData("word1-", new[] { "WORD1", "-" })]
24+
[InlineData("word1&", new[] { "WORD1", "&" })]
2325
[InlineData("word1-\r\n word2", new[] { "WORD1", "WORD2" })]
2426
[InlineData("word1-\r\nword2", new[] { "WORD1WORD2" })]
2527
[InlineData("word1-\nword2", new[] { "WORD1WORD2" })]

0 commit comments

Comments
 (0)