1
+ % option reentrant noyywrap case - insensitive
2
+ % option extra- type= " com::wavii::pfp::tokenizer::tokenizer_out *"
3
+ % {
4
+
5
#include <new>

#include <pfp/tokenizer.h>
6
+
7
+ % }
8
+
9
/* Markup, character-entity and whitespace patterns.
   SGML matches a whole <...> tag.  The SP* names match '&...;' entity
   spellings (SPMDASH also lists raw byte sequences as alternatives).
   SENTEND is used as trailing context by the {ABBREV1}/{SENTEND} rule.
   NOTE(review): the blanks inside these patterns look like an artifact of how
   this text was extracted -- compare with the upstream file before running
   flex on it. */
+ SGML < \/ ? [A - Za - z! ][^ > ]* >
10
+ SPMDASH & (MD | mdash);| \x96| \x97| \xe2\x80\x93| \xe2\x80\x94
11
+ SPAMP & amp;
12
+ SPPUNC & (HT | TL | UR | LR | QC | QL | QR | odq| cdq| lt| gt| #[0 - 9 ]+ );
13
+ SPLET & [aeiouAEIOU](acute| grave| uml);
14
+ SPACE [ \t]+
15
+ SPACENL [ \t\r\n]+
16
+ SENTEND [ \t\n][ \t\n]+ | [ \t\n]+ ([A - Z ]| {SGML })
17
/* Numeric patterns: dates, plain / signed / parenthesised numbers, fractions
   and currency signs.  The rules emit FRAC matches through
   put_escape(yytext, '/') and FRAC2 / DOLSIGN2 matches through put_cp1252().
   NOTE(review): FRAC2 and DOLSIGN2 are written as multi-byte sequences,
   presumably UTF-8 encodings of single characters -- confirm the expected
   input encoding. */
+ DIGIT [0 - 9 ]
18
+ DATE {DIGIT }{1 ,2 }[\- \/ ]{DIGIT }{1 ,2 }[\- \/ ]{DIGIT }{2 ,4 }
19
+ NUM {DIGIT }+ | {DIGIT }* ([. : ,]{DIGIT }+ )+
20
+ NUMBER [\-+ ]? {NUM }| \({NUM }\)
21
+ /* Constrain fraction to only match likely fractions */
22
+ FRAC ({DIGIT }{1 ,4 }[- ])? {DIGIT }{1 ,4 }\\? \/ {DIGIT }{1 ,4 }
23
+ FRAC2 \xc2\xbc| \xc2\xbd| \xc2\xbe
24
+ DOLSIGN ([A - Z ]* \$| #)
25
+ DOLSIGN2 \xc2\xa2| \xc2\xa3| \xc2\x80| \xe2\x82\xac
26
+ /* not used DOLLAR {DOLSIGN}[ \t]*{NUMBER} */
27
+ /* |\( ?{NUMBER} ?\)) # is for pound signs */
28
/* Word-level patterns: plain words, hyphen/slash/ampersand compounds (THING*,
   HTHING), apostrophe forms (APOS, REDAUX, SREDAUX, APOWORD) and URL / e-mail
   shapes.  WORD and SWORD are only split from a following REDAUX / SREDAUX
   through the trailing-context rules {WORD}/{REDAUX} and {SWORD}/{SREDAUX}.
   NOTE(review): the blanks inside several patterns (e.g. "\x c2", "{APOS }")
   look like extraction damage -- verify against the upstream file. */
+ WORD ([A - Za - z]| \xc3[\x80- \xbf]| {SPLET })+
29
+ /* The $ was for things like New$ */
30
+ /* WAS: only keep hyphens with short one side like co-ed */
31
+ /* But treebank just allows hyphenated things as words! */
32
+ THING [A - Za - z0- 9 ]+ ([_- ][A - Za - z0- 9 ]+ )*
33
+ THINGA [A - Z ]+ (([+ & ]| {SPAMP })[A - Z ]+ )+
34
+ THING3 [A - Za - z0- 9 ]+ (- [A - Za - z]+ ){0 ,2 }(\\? \/ [A - Za - z0- 9 ]+ (- [A - Za - z]+ ){0 ,2 }){1 ,2 }
35
+ APOS [' ]|\x c2\x 92|\x e2\x 80\x 99|'
36
+ HTHING ([A-Za-z0-9][A-Za-z0-9%.,]*(-([A-Za-z0-9]+|{ACRO}\. ))+)|[dDOlL]{APOS}{THING}
37
+ REDAUX {APOS}([msdMSD]|re|ve|ll)
38
+ /* For things that will have n' t on the end. They can' t end in ' n' */
39
+ SWORD [A-Za-z]*[A-MO-Za-mo-z]
40
+ SREDAUX n{APOS}t
41
+ /* Tokens you want but already okay: C' mon ' n' ' [2-9]0s ' [eE]m ' till?
42
+ [Yy]' all ' Cause Shi' ite B ' Gosh o' clock. Here now only need apostrophe
43
+ final words. */
44
+ APOWORD {APOS }n{APOS }? | [lLdDjJ]' |Dunkin{APOS}|somethin{APOS}|ol{APOS}|{APOS}em|C{APOS}mon|{APOS}[2-9]0s|{APOS}till?|o{APOS}clock|[A-Za-z][a-z]*[aeiou]{APOS}[aeiou][a-z]*|{APOS}cause
45
+ FULLURL https?:\/\/ [^ \t\n\f\r\" <>|()]+[^ \t\n\f\r\" <>|.!?(){},-]
46
+ LIKELYURL ((www\. ([^ \t\n\f\r\" <>|.!?(){},]+\. )+[a-zA-Z]{2,4})|(([^ \t\n\f\r\" `' <> | . ! ? (){},- _$]+ \. )+ (com| net| org| edu)))(\/ [^ \t\n\f\r\" <>|()]+[^ \t\n\f\r\" <>|.!?(){},-])?
47
+ EMAIL [a-zA-Z0-9][^ \t\n\f\r\" <>|()]*@([^ \t\n\f\r\" <>|().]+\. )+[a-zA-Z]{2,4}
48
+
49
/* Abbreviation vocabularies.  The rules only match the two composed names:
   ABBREV1 (ABBREV3 followed by '.') and ABBREV2 (ABBREV4 followed by '.'),
   plus ABBREV4 and ACRO on their own in front of whitespace.
   NOTE(review): "\. " with a trailing blank in ACRO / ABBREV4 looks like
   extraction damage -- verify against the upstream file. */
+ /* Abbreviations - induced from 1987 WSJ by hand */
50
+ ABMONTH Jan|Feb|Mar|Apr|Jun|Jul|Aug|Sep|Sept|Oct|Nov|Dec
51
+ /* Jun and Jul barely occur, but don't seem dangerous */
52
+ ABDAYS Mon|Tue|Tues|Wed|Thu|Thurs|Fri
53
+ /* In caseless, |a\. m|p\. m handled as ACRO, and this is better as can often
54
+ be followed by capitalized. */
55
+ /* Sat. and Sun. barely occur and can easily lead to errors, so we omit them */
56
+ ABSTATE Calif|Mass|Conn|Fla|Ill|Mich|Pa|Va|Ariz|Tenn|Mo|Md|Wis|Minn|Ind|Okla|Wash|Kan|Ore|Ga|Colo|Ky|Del|Ala|La|Nev|Neb|Ark|Miss|Vt|Wyo|Tex
57
+ ACRO [A-Za-z](\. [A-Za-z])+|(Canada|Sino|Korean|EU|Japan|non)-U\. S|U\. S\. -(U\. K|U\. S\. S\. R)
58
+ ABTITLE Mr|Mrs|Ms|Miss|Drs?|Profs?|Sens?|Reps?|Lt|Col|Gen|Messrs|Govs?|Adm|Rev|Maj|Sgt|Pvt|Mt|Capt|St|Ave|Pres
59
+ ABPTIT Jr|Bros|Sr
60
+ ABCOMP Inc|Cos?|Corp|Pty|Ltd|Plc|Bancorp|Dept|Mfg|Bhd|Assn
61
+ ABNUM Nos?|Prop|Ph
62
+ /* p used to be in ABNUM list, but it can't be any more, since the lexer
63
+ is now caseless. We don't want to have it recognized for P. Both
64
+ p. and P. are now under ABBREV4. ABLIST also went away as no-op [a-e] */
65
+ /* ABBREV1 abbreviations are normally followed by lower case words. If
66
+ they're followed by an uppercase one, we assume there is also a
67
+ sentence boundary */
68
+ ABBREV3 {ABMONTH}|{ABDAYS}|{ABSTATE}|{ABCOMP}|{ABNUM}|{ABPTIT}|etc|ft
69
+ ABBREV1 {ABBREV3}\.
70
+
71
+ /* ABBREV2 abbreviations are normally followed by an upper case word. We
72
+ assume they aren't used sentence finally */
73
+ /* ACRO Is a bad case -- can go either way! */
74
+ ABBREV4 [A-Za-z]|{ABTITLE}|vs|Alex|Cie|a\. k\. a|TREAS|{ACRO}
75
+ ABBREV2 {ABBREV4}\.
76
+ /* Cie. is used before French companies */
77
+ /* in the WSJ Alex. is generally an abbreviation for Alex. Brown, brokers! */
78
+ /* In tables: Mkt. for market Div. for division of company, Chg., Yr.: year */
79
+
80
/* Phone numbers, brackets, punctuation runs, quote characters and a list of
   treebank special-case tokens (TBSPEC, TBSPEC2).
   The rules rewrite OPBRAC / CLBRAC / LDOTS matches to fixed strings and
   collapse HYPHENS matches of length 3-4 to a dash token.
   NOTE(review): blanks such as "\x e8" and "\. {3,5}" look like extraction
   damage; DBLQUOT's two alternatives currently read as the same character --
   verify both against the upstream file. */
+ PHONE \( [0-9]{3}\)\ ?[0-9]{3}[\- ][0-9]{4}
81
+ OPBRAC [<\[ ]
82
+ CLBRAC [>\] ]
83
+ HYPHENS \- +|(\x e8\x 88\x 92)+
84
+ LDOTS \. {3,5}|(\.\ ){2,4}\. |\x c2\x 85|\x e2\x 80\x a6
85
+ ATS @+
86
+ UNDS _+
87
+ ASTS \* +|(\\\* ){1,3}
88
+ HASHES #+
89
+ FNMARKS {ATS}|{HASHES}|{UNDS}
90
+ INSENTP [,;:]
91
+ QUOTES `|{APOS}|``|''|(\x e2\x 80\x 98|\x e2\x 80\x 99|\x e2\x 80\x 9c|\x e2\x 80\x 9d|\x c2\x 91|\x c2\x 92|\x c2\x 93|\x c2\x 94){1,2}
92
+ DBLQUOT \" |"
93
+ TBSPEC -(RRB|LRB|RCB|LCB|RSB|LSB)-|C\. D\. s|D'Amico|M'Bow|pro-|anti-|S&P-500|Jos\. |cont'd\. ?|B'Gosh|S&Ls|N'Ko|'twas
94
+ TBSPEC2 {APOS}[0-9][0-9]
95
+
96
+ %%
97
+
98
{SGML}                  { yyextra->put(yytext); /* tags are passed through unchanged */ }
  /* Entity spellings of a dash / ampersand are normalised to the plain
     token.  The literals must not carry surrounding blanks. */
{SPMDASH}               { yyextra->put("--"); }
{SPAMP}                 { yyextra->put("&"); }
{SPPUNC}                { yyextra->put(yytext); }
  /* A word followed by a clitic ('s, 're, n't ...) is cut before the clitic;
     the clitic itself is picked up by the REDAUX / SREDAUX rules. */
{WORD}/{REDAUX}         { yyextra->put(yytext); }
{SWORD}/{SREDAUX}       { yyextra->put(yytext); }
{WORD}                  { yyextra->put_american(yytext); }
{APOWORD}               { yyextra->put(yytext); }
{FULLURL}               { yyextra->put(yytext); }
{LIKELYURL}             { yyextra->put(yytext); }
{EMAIL}                 { yyextra->put(yytext); }
{REDAUX}/[^A-Za-z]      { yyextra->put_cp1252(yytext); }
{SREDAUX}               { yyextra->put_cp1252(yytext); }
{DATE}                  { yyextra->put(yytext); }
{NUMBER}                { yyextra->put(yytext); }
{FRAC}                  { yyextra->put_escape(yytext, '/'); }
{FRAC2}                 { yyextra->put_cp1252(yytext); }
{TBSPEC}                { yyextra->put(yytext); }
{THING3}                { yyextra->put_escape(yytext, '/'); }
{DOLSIGN}               { yyextra->put(yytext); }
{DOLSIGN2}              { yyextra->put_cp1252(yytext); }
{ABBREV1}/{SENTEND}     { yyextra->put(yytext); /* TODO: reinstate this when i can figure out how to get flex/jflex case-insensitivity to be the same: unput('.'); */ /* return a period for next time */ }
{ABBREV1}               { yyextra->put(yytext); }
{ABBREV2}               { yyextra->put(yytext); }
{ABBREV4}/{SPACE}       { yyextra->put(yytext); }
{ACRO}/{SPACENL}        { yyextra->put(yytext); }
{TBSPEC2}/{SPACENL}     { yyextra->put(yytext); }
  /* No blank may sit between "\." and the trailing-context slash: an
     unquoted blank ends a flex pattern. */
{WORD}\./{INSENTP}      { yyextra->put(yytext); }
{PHONE}                 { yyextra->put(yytext); }
127
{DBLQUOT}/[A-Za-z0-9$]  { yyextra->put("``"); /* double quote directly before a word: opening */ }
{DBLQUOT}               { yyextra->put("''"); }
\+                      { yyextra->put(yytext); }
%|&                     { yyextra->put(yytext); }
\~|\^                   { yyextra->put(yytext); }
  /* Dropped characters: '|', '\' and DEL.  DEL has to be spelled \x7f; the
     bare text 0x7f is a four-character literal and, sitting ahead of {THING},
     would silently swallow the input "0x7f". */
\||\\|\x7f              { }
  /* Brackets are emitted as the same -XXX- tokens that TBSPEC accepts on
     input, so the literals must not contain blanks. */
{OPBRAC}                { yyextra->put("-LRB-"); }
{CLBRAC}                { yyextra->put("-RRB-"); }
\{                      { yyextra->put("-LCB-"); }
\}                      { yyextra->put("-RCB-"); }
\(                      { yyextra->put("-LRB-"); }
\)                      { yyextra->put("-RRB-"); }
  /* Only runs of 3 or 4 matched bytes collapse to a dash token; any other
     length is emitted as typed. */
{HYPHENS}               { if (yyleng >= 3 && yyleng <= 4) yyextra->put("--"); else yyextra->put(yytext); }
{LDOTS}                 { yyextra->put("..."); }
{FNMARKS}               { yyextra->put(yytext); }
{ASTS}                  { yyextra->put_escape(yytext, '*'); }
{INSENTP}               { yyextra->put(yytext); }
\.|\?|\!                { yyextra->put(yytext); }
=                       { yyextra->put(yytext); }
\/                      { yyextra->put_escape(yytext, '/'); }
147
{HTHING}/[^a-zA-Z0-9.+] { yyextra->put(yytext); }
{THING}                 { yyextra->put(yytext); }
{THINGA}                { yyextra->put_amp(yytext); }
  /* A single quote directly before a letter is emitted as an opening quote;
     yyless(1) returns everything after the quote to the input. */
'[A-Za-z].              { yyextra->put("`"); yyless(1); /* invert quote - using trailing context didn't work.... */ }
{REDAUX}                { yyextra->put_cp1252(yytext); }
{QUOTES}                { yyextra->put_cp1252(yytext); }
  /* Whitespace and NUL bytes produce no token. */
\0|{SPACE}              { }
\n|\r|\r\n              { }
&nbsp;                  { /* NOTE(review): this rule's pattern was missing from the text under review; "&nbsp;" is assumed by analogy with the other entity rules -- confirm against the upstream file. */ }
  /* Anything no rule above recognises is reported, one character at a time. */
.                       { yyextra->err(yytext); }
157
+
158
+ %%
159
+
160
+ void com::wavii::pfp::tokenizer::tokenize(const std::string & in, std::vector<std::string> & out) const
161
+ {
162
+ yyscan_t scanner;
163
+ tokenizer_out to(out, *this);
164
+ yylex_init_extra( &to, &scanner );
165
+ yy_scan_string(in.c_str(), scanner);
166
+ yylex(scanner);
167
+ yylex_destroy(scanner);
168
+ }
0 commit comments