Skip to content
This repository was archived by the owner on Apr 24, 2020. It is now read-only.

Commit c1013ff

Browse files
committed
Initial import.
0 parents  commit c1013ff

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

44 files changed

+187675
-0
lines changed

CMakeLists.txt

+57
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
CMAKE_MINIMUM_REQUIRED(VERSION 2.4) # must come before PROJECT() so the requested version/policy settings are in effect when the project is set up
2+
3+
PROJECT(pfp)
4+
5+
SET(CMAKE_VERBOSE_MAKEFILE ON)
6+
7+
# binaries get installed under this prefix. NOTE(review): this unconditional SET looks like it would also shadow a -DCMAKE_INSTALL_PREFIX=... passed at configure time -- confirm that is intended.
8+
SET(CMAKE_INSTALL_PREFIX "/usr/")
9+
10+
# add definitions, compiler switches, etc. NOTE(review): -march=native/-mtune=native tune the build for the compiling host's CPU, and the backquoted getconf call appears to rely on the build tool's shell for expansion -- confirm before using a non-Makefile generator.
11+
ADD_DEFINITIONS(-Wall -O3 -DNDEBUG -march=native -mtune=native `getconf LFS_CFLAGS`)
12+
13+
INCLUDE_DIRECTORIES(include /usr/include/python2.6)
14+
15+
ADD_EXECUTABLE(test
16+
src/test/lexicon.cpp
17+
src/test/pcfg_parser.cpp
18+
src/test/tokenizer.cpp
19+
src/test/main.cpp
20+
)
21+
22+
ADD_EXECUTABLE(pfpc
23+
src/pfpc/main.cpp
24+
)
25+
26+
ADD_LIBRARY(pfp SHARED
27+
src/pfp/config
28+
src/pfp/tokenizer.yy
29+
)
30+
31+
ADD_EXECUTABLE(pfpd
32+
src/pfpd/main
33+
src/pfpd/pfpd_handler
34+
src/moost/http/mime_types
35+
src/moost/http/reply
36+
src/moost/http/request_parser
37+
)
38+
39+
ADD_LIBRARY(pypfp SHARED
40+
src/pypfp/pypfp
41+
)
42+
43+
TARGET_LINK_LIBRARIES(pypfp pfp boost_python boost_filesystem boost_thread boost_regex boost_system icuio)
44+
TARGET_LINK_LIBRARIES(pfpd pfp boost_filesystem boost_thread boost_regex boost_system icuio)
45+
TARGET_LINK_LIBRARIES(pfpc pfp boost_filesystem boost_thread boost_regex boost_system icuio)
46+
TARGET_LINK_LIBRARIES(test pfp boost_thread boost_unit_test_framework boost_regex icuio)
47+
48+
INSTALL(TARGETS pfpd DESTINATION bin)
49+
INSTALL(TARGETS pfpc DESTINATION bin)
50+
51+
INSTALL(TARGETS pfp LIBRARY DESTINATION lib)
52+
INSTALL(DIRECTORY share/pfp DESTINATION share)
53+
54+
# Python module naming: build the pypfp target as pfp.so rather than libpypfp.so (OUTPUT_NAME renames it, the empty PREFIX drops "lib"), then install it under lib/python2.6/dist-packages relative to the install prefix.
55+
SET_TARGET_PROPERTIES(pypfp PROPERTIES
56+
  OUTPUT_NAME "pfp" PREFIX "")
57+
INSTALL(TARGETS pypfp LIBRARY DESTINATION lib/python2.6/dist-packages)

etc/pfp/tokenizer.flex

+168
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,168 @@
1+
%option reentrant noyywrap case-insensitive
2+
%option extra-type="com::wavii::pfp::tokenizer::tokenizer_out *"
3+
%{
4+
5+
#include <pfp/tokenizer.h>
6+
#include <stdexcept>
7+
%}
8+
9+
SGML <\/?[A-Za-z!][^>]*>
10+
SPMDASH &(MD|mdash);|\x96|\x97|\xe2\x80\x93|\xe2\x80\x94
11+
SPAMP &amp;
12+
SPPUNC &(HT|TL|UR|LR|QC|QL|QR|odq|cdq|lt|gt|#[0-9]+);
13+
SPLET &[aeiouAEIOU](acute|grave|uml);
14+
SPACE [ \t]+
15+
SPACENL [ \t\r\n]+
16+
SENTEND [ \t\n][ \t\n]+|[ \t\n]+([A-Z]|{SGML})
17+
DIGIT [0-9]
18+
DATE {DIGIT}{1,2}[\-\/]{DIGIT}{1,2}[\-\/]{DIGIT}{2,4}
19+
NUM {DIGIT}+|{DIGIT}*([.:,]{DIGIT}+)+
20+
NUMBER [\-+]?{NUM}|\({NUM}\)
21+
/* Constrain fraction to only match likely fractions */
22+
FRAC ({DIGIT}{1,4}[- ])?{DIGIT}{1,4}\\?\/{DIGIT}{1,4}
23+
FRAC2 \xc2\xbc|\xc2\xbd|\xc2\xbe
24+
DOLSIGN ([A-Z]*\$|#)
25+
DOLSIGN2 \xc2\xa2|\xc2\xa3|\xc2\x80|\xe2\x82\xac
26+
/* not used DOLLAR {DOLSIGN}[ \t]*{NUMBER} */
27+
/* |\( ?{NUMBER} ?\)) # is for pound signs */
28+
WORD ([A-Za-z]|\xc3[\x80-\xbf]|{SPLET})+
29+
/* The $ was for things like New$ */
30+
/* WAS: only keep hyphens with short one side like co-ed */
31+
/* But treebank just allows hyphenated things as words! */
32+
THING [A-Za-z0-9]+([_-][A-Za-z0-9]+)*
33+
THINGA [A-Z]+(([+&]|{SPAMP})[A-Z]+)+
34+
THING3 [A-Za-z0-9]+(-[A-Za-z]+){0,2}(\\?\/[A-Za-z0-9]+(-[A-Za-z]+){0,2}){1,2}
35+
APOS [']|\xc2\x92|\xe2\x80\x99|&apos;
36+
HTHING ([A-Za-z0-9][A-Za-z0-9%.,]*(-([A-Za-z0-9]+|{ACRO}\.))+)|[dDOlL]{APOS}{THING}
37+
REDAUX {APOS}([msdMSD]|re|ve|ll)
38+
/* For things that will have n't on the end. They can't end in 'n' */
39+
SWORD [A-Za-z]*[A-MO-Za-mo-z]
40+
SREDAUX n{APOS}t
41+
/* Tokens you want but already okay: C'mon 'n' '[2-9]0s '[eE]m 'till?
42+
[Yy]'all 'Cause Shi'ite B'Gosh o'clock. Here now only need apostrophe
43+
final words. */
44+
APOWORD {APOS}n{APOS}?|[lLdDjJ]'|Dunkin{APOS}|somethin{APOS}|ol{APOS}|{APOS}em|C{APOS}mon|{APOS}[2-9]0s|{APOS}till?|o{APOS}clock|[A-Za-z][a-z]*[aeiou]{APOS}[aeiou][a-z]*|{APOS}cause
45+
FULLURL https?:\/\/[^ \t\n\f\r\"<>|()]+[^ \t\n\f\r\"<>|.!?(){},-]
46+
LIKELYURL ((www\.([^ \t\n\f\r\"<>|.!?(){},]+\.)+[a-zA-Z]{2,4})|(([^ \t\n\f\r\"`'<>|.!?(){},-_$]+\.)+(com|net|org|edu)))(\/[^ \t\n\f\r\"<>|()]+[^ \t\n\f\r\"<>|.!?(){},-])?
47+
EMAIL [a-zA-Z0-9][^ \t\n\f\r\"<>|()]*@([^ \t\n\f\r\"<>|().]+\.)+[a-zA-Z]{2,4}
48+
49+
/* Abbreviations - induced from 1987 WSJ by hand */
50+
ABMONTH Jan|Feb|Mar|Apr|Jun|Jul|Aug|Sep|Sept|Oct|Nov|Dec
51+
/* Jun and Jul barely occur, but don't seem dangerous */
52+
ABDAYS Mon|Tue|Tues|Wed|Thu|Thurs|Fri
53+
/* In caseless, |a\.m|p\.m handled as ACRO, and this is better as can often
54+
be followed by capitalized. */
55+
/* Sat. and Sun. barely occur and can easily lead to errors, so we omit them */
56+
ABSTATE Calif|Mass|Conn|Fla|Ill|Mich|Pa|Va|Ariz|Tenn|Mo|Md|Wis|Minn|Ind|Okla|Wash|Kan|Ore|Ga|Colo|Ky|Del|Ala|La|Nev|Neb|Ark|Miss|Vt|Wyo|Tex
57+
ACRO [A-Za-z](\.[A-Za-z])+|(Canada|Sino|Korean|EU|Japan|non)-U\.S|U\.S\.-(U\.K|U\.S\.S\.R)
58+
ABTITLE Mr|Mrs|Ms|Miss|Drs?|Profs?|Sens?|Reps?|Lt|Col|Gen|Messrs|Govs?|Adm|Rev|Maj|Sgt|Pvt|Mt|Capt|St|Ave|Pres
59+
ABPTIT Jr|Bros|Sr
60+
ABCOMP Inc|Cos?|Corp|Pty|Ltd|Plc|Bancorp|Dept|Mfg|Bhd|Assn
61+
ABNUM Nos?|Prop|Ph
62+
/* p used to be in ABNUM list, but it can't be any more, since the lexer
63+
is now caseless. We don't want to have it recognized for P. Both
64+
p. and P. are now under ABBREV4. ABLIST also went away as no-op [a-e] */
65+
/* ABBREV1 abbreviations are normally followed by lower case words. If
66+
they're followed by an uppercase one, we assume there is also a
67+
sentence boundary */
68+
ABBREV3 {ABMONTH}|{ABDAYS}|{ABSTATE}|{ABCOMP}|{ABNUM}|{ABPTIT}|etc|ft
69+
ABBREV1 {ABBREV3}\.
70+
71+
/* ABBREV2 abbreviations are normally followed by an upper case word. We
72+
assume they aren't used sentence finally */
73+
/* ACRO Is a bad case -- can go either way! */
74+
ABBREV4 [A-Za-z]|{ABTITLE}|vs|Alex|Cie|a\.k\.a|TREAS|{ACRO}
75+
ABBREV2 {ABBREV4}\.
76+
/* Cie. is used before French companies */
77+
/* in the WSJ Alex. is generally an abbreviation for Alex. Brown, brokers! */
78+
/* In tables: Mkt. for market Div. for division of company, Chg., Yr.: year */
79+
80+
PHONE \([0-9]{3}\)\ ?[0-9]{3}[\- ][0-9]{4}
81+
OPBRAC [<\[]
82+
CLBRAC [>\]]
83+
HYPHENS \-+|(\xe8\x88\x92)+
84+
LDOTS \.{3,5}|(\.\ ){2,4}\.|\xc2\x85|\xe2\x80\xa6
85+
ATS @+
86+
UNDS _+
87+
ASTS \*+|(\\\*){1,3}
88+
HASHES #+
89+
FNMARKS {ATS}|{HASHES}|{UNDS}
90+
INSENTP [,;:]
91+
QUOTES `|{APOS}|``|''|(\xe2\x80\x98|\xe2\x80\x99|\xe2\x80\x9c|\xe2\x80\x9d|\xc2\x91|\xc2\x92|\xc2\x93|\xc2\x94){1,2}
92+
DBLQUOT \"|&quot;
93+
TBSPEC -(RRB|LRB|RCB|LCB|RSB|LSB)-|C\.D\.s|D'Amico|M'Bow|pro-|anti-|S&P-500|Jos\.|cont'd\.?|B'Gosh|S&Ls|N'Ko|'twas
94+
TBSPEC2 {APOS}[0-9][0-9]
95+
96+
%%
97+
98+
{SGML} { yyextra->put(yytext); }
99+
{SPMDASH} { yyextra->put("--"); }
100+
{SPAMP} { yyextra->put("&"); }
101+
{SPPUNC} { yyextra->put(yytext); }
102+
{WORD}/{REDAUX} { yyextra->put(yytext); }
103+
{SWORD}/{SREDAUX} { yyextra->put(yytext); }
104+
{WORD} { yyextra->put_american(yytext); }
105+
{APOWORD} { yyextra->put(yytext); }
106+
{FULLURL} { yyextra->put(yytext); }
107+
{LIKELYURL} { yyextra->put(yytext); }
108+
{EMAIL} { yyextra->put(yytext); }
109+
{REDAUX}/[^A-Za-z] { yyextra->put_cp1252(yytext); }
110+
{SREDAUX} { yyextra->put_cp1252(yytext); }
111+
{DATE} { yyextra->put(yytext); }
112+
{NUMBER} { yyextra->put(yytext); }
113+
{FRAC} { yyextra->put_escape(yytext, '/'); }
114+
{FRAC2} { yyextra->put_cp1252(yytext); }
115+
{TBSPEC} { yyextra->put(yytext); }
116+
{THING3} { yyextra->put_escape(yytext, '/'); }
117+
{DOLSIGN} { yyextra->put(yytext); }
118+
{DOLSIGN2} { yyextra->put_cp1252(yytext); }
119+
{ABBREV1}/{SENTEND} { yyextra->put(yytext); /* TODO: reinstate this when i can figure out how to get flex/jflex case-insensitivity to be the same: unput('.'); */ /* return a period for next time */ }
120+
{ABBREV1} { yyextra->put(yytext); }
121+
{ABBREV2} { yyextra->put(yytext); }
122+
{ABBREV4}/{SPACE} { yyextra->put(yytext); }
123+
{ACRO}/{SPACENL} { yyextra->put(yytext); }
124+
{TBSPEC2}/{SPACENL} { yyextra->put(yytext); }
125+
{WORD}\./{INSENTP} { yyextra->put(yytext); }
126+
{PHONE} { yyextra->put(yytext); }
127+
{DBLQUOT}/[A-Za-z0-9$] { yyextra->put("``"); }
128+
{DBLQUOT} { yyextra->put("''"); }
129+
\+ { yyextra->put(yytext); }
130+
%|& { yyextra->put(yytext); }
131+
\~|\^ { yyextra->put(yytext); }
132+
\||\\|\x7f { /* silently drop pipe, backslash and the DEL byte; the previous pattern "0x7f" matched the literal four-character text 0x7f instead of DEL */ }
133+
{OPBRAC} { yyextra->put("-LRB-"); }
134+
{CLBRAC} { yyextra->put("-RRB-"); }
135+
\{ { yyextra->put("-LCB-"); }
136+
\} { yyextra->put("-RCB-"); }
137+
\( { yyextra->put("-LRB-"); }
138+
\) { yyextra->put("-RRB-"); }
139+
{HYPHENS} { if (yyleng >= 3 && yyleng <= 4) yyextra->put("--"); else yyextra->put(yytext); }
140+
{LDOTS} { yyextra->put("..."); }
141+
{FNMARKS} { yyextra->put(yytext); }
142+
{ASTS} { yyextra->put_escape(yytext, '*'); }
143+
{INSENTP} { yyextra->put(yytext); }
144+
\.|\?|\! { yyextra->put(yytext); }
145+
= { yyextra->put(yytext); }
146+
\/ { yyextra->put_escape(yytext, '/'); }
147+
{HTHING}/[^a-zA-Z0-9.+] { yyextra->put(yytext); }
148+
{THING} { yyextra->put(yytext); }
149+
{THINGA} { yyextra->put_amp(yytext); }
150+
'[A-Za-z]. { yyextra->put("`"); yyless(1); /* invert quote - using trailing context didn't work.... */ }
151+
{REDAUX} { yyextra->put_cp1252(yytext); }
152+
{QUOTES} { yyextra->put_cp1252(yytext); }
153+
\0|{SPACE} { }
154+
\n|\r|\r\n { }
155+
&nbsp; { }
156+
. { yyextra->err(yytext); }
157+
158+
%%
159+
/* Runs the scanner over `in`, feeding matches to a tokenizer_out constructed over `out` and this tokenizer; a private reentrant scanner is created and destroyed on each call. Throws std::runtime_error if the scanner cannot be initialised. */
160+
void com::wavii::pfp::tokenizer::tokenize(const std::string & in, std::vector<std::string> & out) const
161+
{
162+
yyscan_t scanner;
163+
tokenizer_out to(out, *this);
164+
if (yylex_init_extra( &to, &scanner ) != 0) throw std::runtime_error("pfp tokenizer: yylex_init_extra failed"); /* scanner must not be used after a failed init */
165+
yy_scan_bytes(in.data(), static_cast<int>(in.size()), scanner); /* length-based, so embedded NULs reach the \0 rule instead of truncating the input */
166+
yylex(scanner);
167+
yylex_destroy(scanner);
168+
}

0 commit comments

Comments
 (0)