Skip to content

Commit 442f699

Browse files
committed
Add best_jaccard_substr
1 parent c21c76e commit 442f699

File tree

3 files changed

+55
-0
lines changed

3 files changed

+55
-0
lines changed

fingerprint.cpp

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,9 @@ Fingerprint getFingerprint(string text)
1313
for (int j = 0; j < N_GREEK_LETTERS; ++j) {
1414
f.m_data[i][j] = 0;
1515
}
16+
if (text.empty()) {
17+
return f; // return here, otherwise text.length() - 1 will be 2^64-1
18+
}
1619
for (int i = 0; i < text.length() - 1; ++i) {
1720
char first = text.at(i) - 'a';
1821
char second = text.at(i + 1) - 'a';
@@ -94,3 +97,48 @@ double jaccard_dist(const string &text1, const string &text2)
9497
*/
9598
return 1 - ((double) d) / max(d1, d2);
9699
}
100+
101+
/// Find the substring of @p subtext whose bag of 2-shingles (pairs of adjacent
/// characters) has the smallest Jaccard distance from the bag of @p fixtext.
///
/// The distance of a candidate is 1 - d / max(d1, d2), where d1 and d2 are the
/// numbers of 2-shingles in the fixed text and in the candidate, and d is the
/// size of their bag intersection. Every start position is tried; the candidate
/// is then grown one character at a time while d and d2 are updated
/// incrementally. On ties the candidate that was found first wins.
///
/// @param fixtext The fixed text to compare against.
/// @param subtext The text in which the best matching substring is searched.
/// @return The best matching substring (at least two characters long), or an
///         empty string when no candidate has a distance below 1.0, e.g. when
///         the texts share no 2-shingle or either text is shorter than two
///         characters.
///
/// @note Characters are turned into fingerprint indices by subtracting 'a' and
///       are not range-checked here. NOTE(review): presumably callers pass text
///       already restricted to the fingerprint alphabet -- confirm.
string best_jaccard_substr(const string &fixtext, const string &subtext)
{
    const int maxlen = static_cast<int>(subtext.length());
    const int d1 = static_cast<int>(fixtext.length()) - 1; // number of 2-shingles in fixtext

    // Without at least one 2-shingle on both sides no candidate can ever get
    // a distance below 1.0, so there is nothing to search for.
    if (d1 < 1 || maxlen < 2) {
        return string();
    }

    Fingerprint f1 = getFingerprint(fixtext);
    double best = 1.0;
    int best_s1 = -1; // -1 means "no candidate found yet"
    int best_s2 = -1;

    for (int s1 = 0; s1 + 1 < maxlen; ++s1) {
        // A new start position means a new candidate: reset its fingerprint,
        // its shingle count and the intersection size.
        Fingerprint f2 = getFingerprint(""); // all counters are zero
        int d2 = 0;
        int d = 0;

        // The candidate is subtext[s1 .. s2 + 1]; its first 2-shingle is
        // (s1, s1 + 1), so s2 has to start at s1 (not at s1 + 1).
        for (int s2 = s1; s2 + 1 < maxlen; ++s2) {
            const int first = subtext.at(s2) - 'a';
            const int second = subtext.at(s2 + 1) - 'a';
            ++f2.m_data[first][second];
            ++d2;

            // If the fixed text has at least as many occurrences of this
            // 2-shingle as the candidate has now, the minimum of the two
            // counters has just grown by one, and so has the intersection.
            // Otherwise the candidate already exceeds the fixed text for this
            // shingle and the intersection is unchanged.
            if (f1.m_data[first][second] >= f2.m_data[first][second]) {
                ++d;
            }

            const double dist = 1.0 - static_cast<double>(d) / max(d1, d2);
            if (dist < best) {
                best = dist;
                best_s1 = s1;
                best_s2 = s2;
            }
        }
    }

    if (best_s1 < 0) {
        return string(); // nothing in subtext resembles fixtext
    }
    // The last counted shingle is (best_s2, best_s2 + 1), hence the "+ 2".
    return subtext.substr(best_s1, best_s2 - best_s1 + 2);
}
141+
// Example call:
142+
// best_jaccard_substr("panarsendianoigonmhtranagiontvkyrivklhuhsetai",
143+
// "kaiafeleispandianoigonmhtrantaarsenikatvkyrivpandianoigonmhtranektvnboykolivnhentoiskthnesinsoyosaeangenhtaisoitaarsenikaagiaseistvkyriv")
144+
// should return "kaiafeleispandianoigonmhtrantaarsenikatvkyriv" (taken from Luke 2:23 and Exodus 13:12)

fingerprint.dox

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,3 +33,9 @@ void printDist(Fingerprint f1, Fingerprint f2);
3333
/// Compute the Jaccard similarity for bags. See Leskovec-Rajaraman-Ullman: Mining of massive datasets,
3434
/// Cambridge University Press, 2014, p. 77-78 (see footnote 2 on page 77).
3535
double jaccard_dist(const string& text1, const string& text2);
36+
37+
/// Find the substring of a text that has the minimal Jaccard distance (for bags) from a fixed text.
38+
string best_jaccard_substr(const string &fixtext, const string &subtext) {
39+
/// @param fixtext The fixed text.
40+
/// @param subtext The text for which a substring must be found.
41+
};

fingerprint.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,5 +17,6 @@ int dist(Fingerprint f1, Fingerprint f2);
1717
int dist(const string &text1, const string &text2);
1818
void printDist(Fingerprint f1, Fingerprint f2);
1919
double jaccard_dist(const string &text1, const string &text2);
20+
string best_jaccard_substr(const string &fixtext, const string &subtext);
2021

2122
#endif // FINGERPRINT_H

0 commit comments

Comments
 (0)