@@ -13,6 +13,9 @@ Fingerprint getFingerprint(string text)
13
13
for (int j = 0 ; j < N_GREEK_LETTERS; ++j) {
14
14
f.m_data [i][j] = 0 ;
15
15
}
16
+ if (text.empty ()) {
17
+ return f; // return here, otherwise text.length() - 1 will be 2^64-1
18
+ }
16
19
for (int i = 0 ; i < text.length () - 1 ; ++i) {
17
20
char first = text.at (i) - ' a' ;
18
21
char second = text.at (i + 1 ) - ' a' ;
@@ -94,3 +97,48 @@ double jaccard_dist(const string &text1, const string &text2)
94
97
*/
95
98
return 1 - ((double ) d) / max (d1, d2);
96
99
}
100
+
101
+ string best_jaccard_substr (const string &fixtext, const string &subtext)
102
+ {
103
+ Fingerprint f1 = getFingerprint (fixtext);
104
+ int maxlen = subtext.length ();
105
+ double best = 1.0 ;
106
+ int best_s1, best_s2;
107
+ int d1 = fixtext.length () - 1 ;
108
+ for (int s1 = 0 ; s1 < maxlen; s1++) {
109
+ Fingerprint f2 = getFingerprint (" " ); // empty
110
+
111
+ // When starting a new value of s1, these should be reset:
112
+ int d2 = 0 ;
113
+ int d = 0 ;
114
+
115
+ for (int s2 = s1 + 1 ; s2 + 1 < maxlen; s2++) { // update fingerprint for subtext
116
+ char first = subtext.at (s2) - ' a' ;
117
+ char second = subtext.at (s2 + 1 ) - ' a' ;
118
+ (f2.m_data [first][second])++;
119
+
120
+ d2++;
121
+ // If the fix text has more (or equally many) occurrences for this 2-shingle than the
122
+ // currently considered one from the subtext, then the minimum of
123
+ // them has just been increased by 1, so we increase d as well:
124
+ if (f1.m_data [first][second] >= f2.m_data [first][second]) d++;
125
+ // Otherwise, the currently considered 2-shingle of the subtext is already
126
+ // greater than the occurrences in the fix text, so the minimum of them
127
+ // remains the same number: no change should be done for d.
128
+
129
+ double dist = 1 - ((double ) d) / max (d1, d2);
130
+ if (dist < best) {
131
+ // std::cout << "d1=" << d1 << " d2=" << d2 << " d=" << d << std::endl;
132
+ best = dist;
133
+ best_s1 = s1;
134
+ best_s2 = s2;
135
+ // printf("best=%4.2f best_s1=%d best_s2=%d\n", best, best_s1, best_s2);
136
+ }
137
+ }
138
+ }
139
+ return subtext.substr (best_s1, best_s2 - best_s1 + 2 );
140
+ }
141
+ // Example call:
142
+ // best_jaccard_substr("panarsendianoigonmhtranagiontvkyrivklhuhsetai",
143
+ // "kaiafeleispandianoigonmhtrantaarsenikatvkyrivpandianoigonmhtranektvnboykolivnhentoiskthnesinsoyosaeangenhtaisoitaarsenikaagiaseistvkyriv")
144
+ // should return "kaiafeleispandianoigonmhtrantaarsenikatvkyriv" (taken from Luke 2:23 and Exodus 13:12)
0 commit comments