@@ -215,34 +215,6 @@ int UNICHARSET::step(const char* str) const {
215
215
if (encoding.empty () || encoding[0 ] == INVALID_UNICHAR_ID) return 0 ;
216
216
return lengths[0 ];
217
217
}
218
- // As step except constraining the search to unichar-ids that are
219
- // self-normalized. Unlike step, does not encode the whole string, therefore
220
- // should be used on short strings (like those obtained from
221
- // get_normed_unichar.)
222
- int UNICHARSET::normed_step (const char * str) const {
223
- // Find the length of the first matching unicharset member.
224
- int length = ids.minmatch (str);
225
- if (length == 0 )
226
- return 0 ; // Empty string or illegal char.
227
-
228
- while (length <= UNICHAR_LEN) {
229
- if (ids.contains (str, length)) {
230
- int matched_id = unichar_to_id (str, length);
231
- const GenericVector<UNICHAR_ID>& matched_norms = normed_ids (matched_id);
232
- bool good_start = matched_norms.size () == 1 &&
233
- matched_norms[0 ] == matched_id;
234
- if (str[length] == ' \0 ' ) {
235
- return good_start ? length : 0 ;
236
- }
237
- if (normed_step (str + length) > 0 )
238
- return length; // This length works!
239
- } else if (str[length] == ' \0 ' ) {
240
- return 0 ; // Ran out of string.
241
- }
242
- ++length;
243
- }
244
- return 0 ;
245
- }
246
218
247
219
// Return whether the given UTF-8 string is encodable with this UNICHARSET.
248
220
// If not encodable, write the first byte offset which cannot be converted
@@ -375,19 +347,13 @@ STRING UNICHARSET::debug_str(UNICHAR_ID id) const {
375
347
// stored in the file, and needs to be set when the UNICHARSET is loaded.
376
348
void UNICHARSET::set_normed_ids(UNICHAR_ID unichar_id) {
  // Rebuilds the normed_ids list of unichar_id from scratch.
  unichars[unichar_id].properties.normed_ids.truncate(0);
  if (unichar_id == UNICHAR_SPACE && id_to_unichar(unichar_id)[0] == ' ') {
    // The space unichar is its own normalization; no encoding is attempted.
    unichars[unichar_id].properties.normed_ids.push_back(UNICHAR_SPACE);
  } else if (!encode_string(unichars[unichar_id].properties.normed.string(),
                            true, &unichars[unichar_id].properties.normed_ids,
                            NULL, NULL)) {
    // The normed string could not be encoded: discard whatever encode_string
    // may have written and fall back to the unichar_id itself.
    unichars[unichar_id].properties.normed_ids.truncate(0);
    unichars[unichar_id].properties.normed_ids.push_back(unichar_id);
  }
}
393
359
@@ -1015,6 +981,24 @@ void UNICHARSET::set_black_and_whitelist(const char* blacklist,
1015
981
}
1016
982
}
1017
983
984
+ // Returns true if there are any repeated unicodes in the normalized
985
+ // text of any unichar-id in the unicharset.
986
+ bool UNICHARSET::AnyRepeatedUnicodes () const {
987
+ int start_id = 0 ;
988
+ if (has_special_codes ()) start_id = SPECIAL_UNICHAR_CODES_COUNT;
989
+ for (int id = start_id; id < size_used; ++id) {
990
+ // Convert to unicodes.
991
+ GenericVector<int > unicodes;
992
+ if (UNICHAR::UTF8ToUnicode (get_normed_unichar (id), &unicodes) &&
993
+ unicodes.size () > 1 ) {
994
+ for (int u = 1 ; u < unicodes.size (); ++u) {
995
+ if (unicodes[u - 1 ] == unicodes[u]) return true ;
996
+ }
997
+ }
998
+ }
999
+ return false ;
1000
+ }
1001
+
1018
1002
int UNICHARSET::add_script (const char * script) {
1019
1003
for (int i = 0 ; i < script_table_size_used; ++i) {
1020
1004
if (strcmp (script, script_table[i]) == 0 )
0 commit comments