3
3
using System . Linq ;
4
4
using System . Threading . Tasks ;
5
5
using BackendFramework . Models ;
6
+ using SIL . Extensions ;
6
7
7
8
namespace BackendFramework . Helper
8
9
{
@@ -56,6 +57,24 @@ public async Task<List<List<Word>>> GetIdenticalVernWords(
56
57
return wordLists ;
57
58
}
58
59
60
/// <summary>
/// Filter already-used words out of a similarity list, then find the earliest sublist of what
/// remains that is not reported as unavailable, and return it along with its similarity score.
/// </summary>
/// <param name="scoreToBeat">
/// Availability is only checked while the second remaining entry scores strictly below this value.
/// </param>
/// <param name="similarWords">
/// Score/word pairs; the first entry is the one against which all the scores were computed.
/// </param>
/// <param name="unavailableIds"> Ids of words that must be excluded from the result. </param>
/// <param name="isUnavailableSet"> Async predicate reporting whether a list of ids is unavailable. </param>
/// <returns>
/// The score of the second remaining entry paired with up to <c>_maxInList</c> remaining words;
/// when fewer than two words remain, the score is <c>_maxScore + 1.0</c> instead.
/// </returns>
private async Task<Tuple<double, List<Word>>> ModifiedScore(double scoreToBeat, List<Tuple<double, Word>> similarWords,
    List<string> unavailableIds, Func<List<string>, Task<bool>> isUnavailableSet)
{
    // Keep only the entries whose word has not been excluded, tracking their ids in parallel.
    var remaining = new List<Tuple<double, Word>>();
    var remainingIds = new List<string>();
    foreach (var pair in similarWords)
    {
        if (unavailableIds.Contains(pair.Item2.Id))
        {
            continue;
        }
        remaining.Add(pair);
        remainingIds.Add(pair.Item2.Id);
    }

    // The availability check is skipped as soon as the list cannot beat the given score.
    while (remainingIds.Count > 1 && remaining[1].Item1 < scoreToBeat)
    {
        var leadingIds = remainingIds[..Math.Min(remainingIds.Count, _maxInList)];
        if (!await isUnavailableSet(leadingIds))
        {
            break;
        }

        // The leading sublist is unavailable: discard the second entry and retry.
        // (The first entry is the one against which all the similarity scores were computed.)
        remaining.RemoveAt(1);
        remainingIds.RemoveAt(1);
    }

    var words = remaining[..Math.Min(remaining.Count, _maxInList)].Select(pair => pair.Item2).ToList();
    if (words.Count < 2)
    {
        // Nothing left to pair with the first word: report a score above the allowed maximum.
        return Tuple.Create(_maxScore + 1.0, words);
    }
    return Tuple.Create(remaining[1].Item1, words);
}
77
+
59
78
/// <summary> Get from specified List several sub-Lists, each a set of similar <see cref="Word"/>s. </summary>
60
79
/// <returns>
61
80
/// A List of Lists: each inner list is ordered by similarity to the first entry in the List;
@@ -64,64 +83,32 @@ public async Task<List<List<Word>>> GetIdenticalVernWords(
64
83
public async Task<List<List<Word>>> GetSimilarWords(
    List<Word> collection, Func<List<string>, Task<bool>> isUnavailableSet)
{
    // Compute, for every word, the list of words similar to it. The work is done in parallel,
    // but AsOrdered keeps the results in collection order so that ties between equally-scored
    // lists are resolved the same way on every run. Lists with a single entry are dropped.
    var similarWordsLists = collection.AsParallel().AsOrdered()
        .Select(w => GetSimilarToWord(w, collection))
        .Where(wl => wl.Count > 1).ToList();

    var best = new List<List<Word>>();
    var bestIds = new List<string>();

    // Greedily pick the lowest-scoring list that is still usable, up to _maxLists lists.
    while (best.Count < similarWordsLists.Count && best.Count < _maxLists)
    {
        // Seed with a score strictly above _maxScore so that a list scoring exactly _maxScore
        // can still win the strict comparisons. (Adding double.Epsilon does not work: it is
        // rounded away for any ordinary value.) _maxScore + 1.0 is also the score ModifiedScore
        // reports for lists with fewer than two usable words, so those can never be selected.
        var candidate = Tuple.Create(_maxScore + 1.0, new List<Word>());
        foreach (var similarWords in similarWordsLists)
        {
            // Words already placed in a chosen list (bestIds) are excluded by ModifiedScore.
            var temp = await ModifiedScore(candidate.Item1, similarWords, bestIds, isUnavailableSet);
            if (temp.Item1 < candidate.Item1)
            {
                candidate = temp;
            }
        }

        // No list beat the seed: nothing usable is left.
        if (candidate.Item2.Count == 0)
        {
            break;
        }
        best.Add(candidate.Item2);
        bestIds.AddRange(candidate.Item2.Select(w => w.Id));
    }
    return best;
}
126
113
127
114
/// <summary> Get from specified List a sub-List with same vern as specified <see cref="Word"/>. </summary>
@@ -142,49 +129,21 @@ private List<Word> GetIdenticalVernToWord(Word word, List<Word> collection)
142
129
return identicalWords ;
143
130
}
144
131
145
/// <summary> Get all elements in specified List that are similar to specified <see cref="Word"/>. </summary>
/// <param name="word"> The word every element of the collection is scored against. </param>
/// <param name="collection"> The words to score. </param>
/// <returns>
/// List of similar <see cref="Word"/>s paired with their scores, ordered by similarity with most
/// similar first. Only elements scoring at most <c>_maxScore</c> are included. Elements with equal
/// scores keep their collection order, except that <paramref name="word"/> itself (when it is in
/// the collection) comes first among elements sharing its score.
/// </returns>
private List<Tuple<double, Word>> GetSimilarToWord(Word word, List<Word> collection)
{
    return collection
        .Select(other => Tuple.Create(GetWordScore(word, other), other))
        // Keep the word only if the score is low enough.
        .Where(pair => pair.Item1 <= _maxScore)
        // OrderBy is a stable sort (unlike List.Sort), so tied entries have a well-defined order.
        .OrderBy(pair => pair.Item1)
        // Tie-break only: put the reference word ahead of other entries with the same score.
        .ThenBy(pair => ReferenceEquals(pair.Item2, word) ? 0 : 1)
        .ToList();
}
190
149
0 commit comments