Skip to content

Commit 1a241ca

Browse files
authored
[MergeDups] Improve duplicate-finder efficiency (#3746)
1 parent 6ed1ef7 commit 1a241ca

File tree

1 file changed

+43
-84
lines changed

1 file changed

+43
-84
lines changed

Backend/Helper/DuplicateFinder.cs

+43-84
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
using System.Linq;
44
using System.Threading.Tasks;
55
using BackendFramework.Models;
6+
using SIL.Extensions;
67

78
namespace BackendFramework.Helper
89
{
@@ -56,6 +57,24 @@ public async Task<List<List<Word>>> GetIdenticalVernWords(
5657
return wordLists;
5758
}
5859

60+
/// <summary> Get the earliest available sublist of the given word-list and its similarity score. </summary>
/// <param name="scoreToBeat">
/// The search for an available sublist only continues while the sublist's score is strictly below this value.
/// </param>
/// <param name="similarWords">
/// Score/word pairs as produced by GetSimilarToWord: the first entry is treated as the reference word
/// and the remaining entries are expected to be in ascending score order.
/// </param>
/// <param name="unavailableIds"> Ids of words that must be excluded before a sublist is built. </param>
/// <param name="isUnavailableSet"> Async check that returns true when the given set of word ids may not be used. </param>
/// <returns>
/// A Tuple of the sublist's score and the sublist itself (at most _maxInList words).
/// The score is that of the second remaining word; if fewer than two words remain,
/// the score is _maxScore + 1.0 so that the caller's strict less-than comparison rejects the result.
/// </returns>
private async Task<Tuple<double, List<Word>>> ModifiedScore(double scoreToBeat, List<Tuple<double, Word>> similarWords,
    List<string> unavailableIds, Func<List<string>, Task<bool>> isUnavailableSet)
{
    // Hash the excluded ids once so that filtering is linear in the number of similar words,
    // rather than scanning the whole id list for every word.
    var excludedIds = new HashSet<string>(unavailableIds);
    var trimmed = similarWords.Where(tuple => !excludedIds.Contains(tuple.Item2.Id)).ToList();
    var ids = trimmed.Select(tuple => tuple.Item2.Id).ToList();

    // NOTE(review): if the reference word itself is in unavailableIds, the trimmed list is headed by a
    // different word while the scores are still relative to the removed one -- confirm this is intended.
    while (ids.Count > 1 && trimmed[1].Item1 < scoreToBeat &&
        await isUnavailableSet(ids.Take(_maxInList).ToList()))
    {
        // If the initial sublist is unavailable, remove the second element and try again.
        // (The first element is the one against which all the similarity scores were computed.)
        trimmed.RemoveAt(1);
        ids.RemoveAt(1);
    }

    var words = trimmed.Take(_maxInList).Select(tuple => tuple.Item2).ToList();
    return Tuple.Create(words.Count < 2 ? _maxScore + 1.0 : trimmed[1].Item1, words);
}
77+
5978
/// <summary> Get from specified List several sub-Lists, each a set of similar <see cref="Word"/>s. </summary>
6079
/// <returns>
6180
/// A List of Lists: each inner list is ordered by similarity to the first entry in the List;
@@ -64,64 +83,32 @@ public async Task<List<List<Word>>> GetIdenticalVernWords(
6483
public async Task<List<List<Word>>> GetSimilarWords(
    List<Word> collection, Func<List<string>, Task<bool>> isUnavailableSet)
{
    // For every word, compute (in parallel) the full list of words within _maxScore of it.
    // AsOrdered keeps the lists in the same order as the input collection, so that ties between
    // equally-scored candidates below are always resolved the same way from run to run.
    // Lists with a single entry have nothing similar to offer and are dropped.
    var similarWordsLists = collection.AsParallel().AsOrdered()
        .Select(w => GetSimilarToWord(w, collection))
        .Where(wl => wl.Count > 1).ToList();

    var best = new List<List<Word>>();
    var bestIds = new List<string>();

    // Smallest double strictly greater than _maxScore (assumes _maxScore is a double -- TODO confirm).
    // Adding double.Epsilon does not work here: for any normal-magnitude value it rounds straight back
    // to _maxScore, so the strict less-than checks would reject sets scoring exactly _maxScore.
    var scoreCeiling = Math.BitIncrement(_maxScore);

    // Greedily pick the best-scoring available set, mark its words as used, and repeat.
    while (best.Count < similarWordsLists.Count && best.Count < _maxLists)
    {
        var candidate = Tuple.Create(scoreCeiling, new List<Word>());
        foreach (var similarWords in similarWordsLists)
        {
            // Only a strictly better score replaces the current candidate,
            // so the earliest list wins among equal scores.
            var temp = await ModifiedScore(candidate.Item1, similarWords, bestIds, isUnavailableSet);
            if (temp.Item1 < candidate.Item1)
            {
                candidate = temp;
            }
        }

        // No list produced an available set with an acceptable score: nothing more to find.
        if (candidate.Item2.Count == 0)
        {
            break;
        }

        best.Add(candidate.Item2);
        bestIds.AddRange(candidate.Item2.Select(w => w.Id));
    }
    return best;
}
126113

127114
/// <summary> Get from specified List a sub-List with same vern as specified <see cref="Word"/>. </summary>
@@ -142,49 +129,21 @@ private List<Word> GetIdenticalVernToWord(Word word, List<Word> collection)
142129
return identicalWords;
143130
}
144131

145-
/// <summary> Get all elements in specified List that are similar to specified <see cref="Word"/>. </summary>
/// <returns>
/// List of score/<see cref="Word"/> pairs whose score is at most _maxScore, ordered by similarity
/// with most similar first. If the specified word is itself an element of the collection (same reference)
/// and passes the score check, it is always the first entry.
/// </returns>
private List<Tuple<double, Word>> GetSimilarToWord(Word word, List<Word> collection)
{
    var similarWords = new List<Tuple<double, Word>>();
    foreach (var other in collection)
    {
        // Add the word if the score is low enough.
        var score = GetWordScore(word, other);
        if (score <= _maxScore)
        {
            similarWords.Add(Tuple.Create(score, other));
        }
    }

    // List<T>.Sort is an unstable sort, so a word that ties with the reference word could end up ahead
    // of it. Consumers treat the first entry as the reference word, so pin it to the front explicitly,
    // then use a stable ordering by score so that ties keep their order from the collection.
    return similarWords
        .OrderBy(pair => ReferenceEquals(pair.Item2, word) ? 0 : 1)
        .ThenBy(pair => pair.Item1)
        .ToList();
}
190149

0 commit comments

Comments
 (0)