Skip to content

Commit ffc6529

Browse files
authored
Merge pull request #1331 from linas/affix-fix
Add assorted affixes to the English language dict
2 parents 46a2d31 + fa24d74 commit ffc6529

File tree

4 files changed

+62
-43
lines changed

4 files changed

+62
-43
lines changed

data/en/4.0.affix

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
% Punctuation appearing on the right-side of words.
99
")" "}" "]" ">" """ » 〉 ) 〕 》 】 ] 』 」 "’’" "’" ” '' ' `
1010
"%" "," ... "." 。 ‧ ":" ";" "?" "!" ‽ ؟ ? !
11-
_ ‐ ‑ ‒ – — ― … ━ – ー ‐ 、=
11+
_ ‐ ‑ ‒ – — ― … ━ – ー ‐ 、 =
1212
~ ¢ ₵ ™ ℠
1313
: RPUNC+;
1414

data/en/4.0.dict

Lines changed: 29 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -10061,7 +10061,7 @@ later earlier:
1006110061
(Wt- & {Xc+}) or
1006210062
[({Xc+ & {Xd-}} & dCO+)] or
1006310063
(Xd- & Xc+ & (MX*x- or MVx-)) or
10064-
({[[@Ec-]]} & {Xc+} & A+) or
10064+
({[[@Ec-]]} & {{Xdp-} & Xc+} & A+) or
1006510065
dAJrc- or dAJlc+)) or
1006610066
(Yt- & (<advcl-verb> or Qe+));
1006710067

@@ -11446,11 +11446,11 @@ just_not: <COMP-OPENER>;
1144611446

1144711447
% PH-: connect, phonetically, to a/an if it is there.
1144811448
<adj-consn>:
11449-
({EA- or ({(<wantPHc>)} & {EF+ or MX*ta+})} & {[[@Ec-]]} & {Xc+} & A+);
11449+
({EA- or ({(<wantPHc>)} & {EF+ or MX*ta+})} & {[[@Ec-]]} & {{Xdp-} & Xc+} & A+);
1145011450
<adj-vowel>:
11451-
({EA- or ({(<wantPHv>)} & {EF+ or MX*ta+})} & {[[@Ec-]]} & {Xc+} & A+);
11451+
({EA- or ({(<wantPHv>)} & {EF+ or MX*ta+})} & {[[@Ec-]]} & {{Xdp-} & Xc+} & A+);
1145211452
<adj-phone>:
11453-
({EA- or ({(<wantPH>)} & {EF+ or MX*ta+})} & {[[@Ec-]]} & {Xc+} & A+);
11453+
({EA- or ({(<wantPH>)} & {EF+ or MX*ta+})} & {[[@Ec-]]} & {{Xdp-} & Xc+} & A+);
1145411454

1145511455
/en/words/words.adj.1-vowel :
1145611456
<marker-common-entity> or
@@ -11488,7 +11488,7 @@ frank.a:
1148811488
% [A-]0.2: "a big green apple" want "big" to modify "apple", not "green"
1148911489
<color-adj>:
1149011490
({EA- or EF+} & {(AN- or [A-]0.2) & {Ds-}} &
11491-
(({[[@Ec-]]} & {Xc+} & A+)
11491+
(({[[@Ec-]]} & {{Xdp-} & Xc+} & A+)
1149211492
or ((Pa- or AF+ or Ma- or dMJra-) & {@MV+} & {<tot-verb>})
1149311493
or ({@MV+} & dMJla+)
1149411494
or AA+
@@ -11636,7 +11636,7 @@ unusual.a useful.a impossible.a annoying.a unfair.a unuseful.a:
1163611636

1163711637
a_bitch :
1163811638
<adj-good>
11639-
or ({EA- or EF+} & {[[@Ec-]]} & {Xc+} & A+);
11639+
or ({EA- or EF+} & {[[@Ec-]]} & {{Xdp-} & Xc+} & A+);
1164011640

1164111641
% Surely this is incomplete...
1164211642
one_and_only in_situ:
@@ -12366,7 +12366,7 @@ next.a:
1236612366
or ({Xc+} & {[[@Ec-]]} & L-)
1236712367
or (DD- & <noun-rel-x> & {<ton-verb>} & <noun-main-x>);
1236812368

12369-
past.a previous.a: ({[[@Ec-]]} & {Xc+} & A+) or L- or (Pa- & {@MV+});
12369+
past.a previous.a: ({[[@Ec-]]} & {{Xdp-} & Xc+} & A+) or L- or (Pa- & {@MV+});
1237012370

1237112371
following.a remaining.a top.i: L-;
1237212372

@@ -13241,7 +13241,13 @@ so_on the_like vice_versa v.v.:
1324113241
% XXX that is, change <WALL> to just WV+.
1324213242
%
1324313243
<sent-start>:
13244-
(<wo-wall> or <wi-wall>) & {hCPx+ or hCPi+ or hCPu+} & {([Xx+]0.06 or Xp+ or Xs+) & {hWV+}} & {RW+ or Xp+};
13244+
(<wo-wall> or <wi-wall>)
13245+
& {hCPx+ or hCPi+ or hCPu+}
13246+
& {([Xx+]0.06 or Xo- or Xp+ or Xs+) & {hWV+}}
13247+
& {RW+ or Xp+};
13248+
13249+
% Spanish sentence starters (inverted question and exclamation marks)
13250+
¿ ¡: Xo- & <sent-start>;
1324513251

1324613252
% <sent-start>: "So, don't do it!"
1324713253
% The cost on sent-start is to force preference for CV over WV,
@@ -13258,6 +13264,7 @@ so.ij:
1325813264
% hWl+ is here, not elsewhere, to avoid use together with Xx+
1325913265
LEFT-WALL:
1326013266
<sent-start>
13267+
or Xo+
1326113268
or (QUd+ & <sent-start> & (Xc+ or [()]) & QUc+)
1326213269
or (hWl+ & {Xj+} & (RW+ or Xp+))
1326313270
or (QUd+ & hWl+ & {Xj+} & (Xc+ or [()]) & QUc+)
@@ -13282,17 +13289,19 @@ RIGHT-WALL: RW- or ({@Xca-} & [[Xc-]]);
1328213289
<post-quote>:
1328313290
QUc- & {<wo-wall> or <wi-wall> or CP+};
1328413291

13285-
« 《 【 『 „ “:
13292+
% Paired opening and closing quote marks. (The underbar, used like a quote-mark, e.g. for _bold text_, is defined with the ASCII double-quote.)
13293+
« 《 【 『 「 „ “ ''.x ’’.x :
1328613294
QUd-;
13287-
» 》 】 』 :
13295+
» 》 】 』 」 ” ''.y ’’.y :
1328813296
<post-quote>;
1328913297

1329013298
% For now, using ".x and ".y in the above definitions multiplies the number
1329113299
% of linkages by 2^(number of "). So it is separated below.
1329213300

1329313301
% [[ZZZ-]]: link to "random" quotation marks that show up "for no reason".
1329413302
% Cannot use a blanket W+ here to pick up all W connectors, because ... ??
13295-
""": QUd- or <post-quote> or [[ZZZ-]];
13303+
% Underbar is for _bold text_.
13304+
""" "_": QUd- or <post-quote> or [[ZZZ-]];
1329613305

1329713306
% Using backtic.x and backtic.y in the above definitions multiplies the
1329813307
% number of linkages by 2^(number of backtics). So it is treated as a
@@ -13308,8 +13317,8 @@ RIGHT-WALL: RW- or ({@Xca-} & [[Xc-]]);
1330813317
or Xi-
1330913318
or <sent-split>;
1331013319

13311-
% Optional RW: "Is this a test?" she asked.
13312-
"!" "?" ‽ ؟ ?:
13320+
% Question and exclamation marks: Optional RW: "Is this a test?" she asked.
13321+
"!" "?" ‽ ؟ ?:
1331313322
(Xp- & RW+)
1331413323
or ({@Xca-} & Xc- & {[RW+]})
1331513324
or ({@Xca-} & Xq+)
@@ -13427,8 +13436,9 @@ but.ij and.ij or.ij not.ij also.ij then.ij but_not and_not and_yet:
1342713436

1342813437
% 、 is the "enumeration-comma" used like a dash ...
1342913438
% ‧ is the "middle dot"
13430-
% The four dashes are e28092 e28093 e28094 e28095
13431-
‒ – — ― ━ ー --.r -.r 、 ~.r ~.r ‧.r :
13439+
% The first two short dashes are e28090 e28091
13440+
% The next four long dashes are e28092 e28093 e28094 e28095
13441+
‐ ‑ ‒ – — ― ━ ー --.r -.r 、 ~.r ~.r ‧.r :
1343213442
[[<colon>]]
1343313443
or ({@Xca-} & (({EBx+} & Xd+) or Xc-))
1343413444
or (Wd- & W+)
@@ -13492,18 +13502,15 @@ but.ij and.ij or.ij not.ij also.ij then.ij but_not and_not and_yet:
1349213502

1349313503
% See also /en/words/currency for currency names that follow a number.
1349413504
$ USD.c US$.c C$.c AUD.c AUD$.c HK.c HK$.c
13495-
£ ₤ € ¤ ₳ ฿ ¢ ₵ ₡ ₢ ₠ ₫ ৳ ƒ ₣ ₲ ₴ ₭ ₺ ℳ ₥ ₦ ₧ ₱ ₰ ₹ ₨ ₪ ₸ ₮ ₩ ¥ ៛ 호점
13505+
£ ₤ € ¤ ₳ ฿ ¢ ₵ ₡ ₢ ₠ ₫ ৳ ƒ ₣ ₲ ₴ ₭ ₺ ℳ ₥ ₦ ₧ ₱ ₰ ₹ ₨ ₪ ₸ ₮ ₩ ¥ ៛ 호점
1349613506
† †† ‡ § ¶ © ® ℗ № "#":
1349713507
NM*x+ & (AN+ or NM*y- or [[G+]] or (NIfu+ or NItu-) or
1349813508
({EN- or NIc- or [[A- & NSa-]]} & {@MX+} &
1349913509
(OD- or ({DD-} & {[[@M+]]} &
1350013510
(<noun-main-p> or <noun-and-p> or [[(Ss+ & <CLAUSE>) or SIs-]])))));
1350113511

1350213512
% service mark, trademark.
13503-
% ℠ ™ :
13504-
13505-
% Espagnol stuff
13506-
% ¿ ¡:
13513+
℠ ™ : G- ;
1350713514

1350813515
"&": G- & {Xd- & G-} & G+;
1350913516

@@ -13521,9 +13528,9 @@ $ USD.c US$.c C$.c AUD.c AUD$.c HK.c HK$.c
1352113528
YS- & (({AL-} & {@L+} & (D+ or DD+)) or [[<noun-main-x>]] or DP+);
1352213529

1352313530
% Wd-: allows "(1 + 1) = 2"
13524-
"(" "[": {Wd-} & {EBx+} & dXdp+;
13531+
"(" "{" "[" "<" 〈 ( 〔 [ : {Wd-} & {EBx+} & dXdp+;
1352513532

13526-
")" "]": {@Xca-} & dXcp-;
13533+
")" "}" "]" ">" 〉 ) 〕 ]: {@Xca-} & dXcp-;
1352713534

1352813535
% foo: F+;
1352913536

data/en/4.0.dict.m4

Lines changed: 26 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -7637,7 +7637,7 @@ later earlier:
76377637
(Wt- & {Xc+}) or
76387638
[({Xc+ & {Xd-}} & dCO+)] or
76397639
(Xd- & Xc+ & (MX*x- or MVx-)) or
7640-
({[[@Ec-]]} & {Xc+} & A+) or
7640+
({[[@Ec-]]} & {{Xdp-} & Xc+} & A+) or
76417641
dAJrc- or dAJlc+)) or
76427642
(Yt- & (<advcl-verb> or Qe+));
76437643

@@ -9021,7 +9021,7 @@ just_not: <COMP-OPENER>;
90219021
% Macro, for all the common parts of an A+ connection, with
90229022
% the phonetic attachment as a variable.
90239023
define(`ADJ_PH',`'
9024-
({EA- or ({($1)} & {EF+ or MX*ta+})} & {[[@Ec-]]} & {Xc+} & A+))
9024+
({EA- or ({($1)} & {EF+ or MX*ta+})} & {[[@Ec-]]} & {{Xdp-} & Xc+} & A+))
90259025

90269026
% PH-: connect, phonetically, to a/an if it is there.
90279027
<adj-consn>: ADJ_PH(<wantPHc>);
@@ -9064,7 +9064,7 @@ frank.a:
90649064
% [A-]0.2: "a big green apple" want "big" to modify "apple", not "green"
90659065
<color-adj>:
90669066
({EA- or EF+} & {(AN- or [A-]0.2) & {Ds-}} &
9067-
(({[[@Ec-]]} & {Xc+} & A+)
9067+
(({[[@Ec-]]} & {{Xdp-} & Xc+} & A+)
90689068
or ((Pa- or AF+ or Ma- or dMJra-) & {@MV+} & {<tot-verb>})
90699069
or ({@MV+} & dMJla+)
90709070
or AA+
@@ -9212,7 +9212,7 @@ unusual.a useful.a impossible.a annoying.a unfair.a unuseful.a:
92129212

92139213
a_bitch :
92149214
<adj-good>
9215-
or ({EA- or EF+} & {[[@Ec-]]} & {Xc+} & A+);
9215+
or ({EA- or EF+} & {[[@Ec-]]} & {{Xdp-} & Xc+} & A+);
92169216

92179217
% Surely this is incomplete...
92189218
one_and_only in_situ:
@@ -9942,7 +9942,7 @@ next.a:
99429942
or ({Xc+} & {[[@Ec-]]} & L-)
99439943
or (DD- & <noun-rel-x> & {<ton-verb>} & <noun-main-x>);
99449944

9945-
past.a previous.a: ({[[@Ec-]]} & {Xc+} & A+) or L- or (Pa- & {@MV+});
9945+
past.a previous.a: ({[[@Ec-]]} & {{Xdp-} & Xc+} & A+) or L- or (Pa- & {@MV+});
99469946

99479947
following.a remaining.a top.i: L-;
99489948

@@ -10817,7 +10817,13 @@ so_on the_like vice_versa v.v.:
1081710817
% XXX that is, change <WALL> to just WV+.
1081810818
%
1081910819
<sent-start>:
10820-
(<wo-wall> or <wi-wall>) & {hCPx+ or hCPi+ or hCPu+} & {([Xx+]0.06 or Xp+ or Xs+) & {hWV+}} & {RW+ or Xp+};
10820+
(<wo-wall> or <wi-wall>)
10821+
& {hCPx+ or hCPi+ or hCPu+}
10822+
& {([Xx+]0.06 or Xo- or Xp+ or Xs+) & {hWV+}}
10823+
& {RW+ or Xp+};
10824+
10825+
% Spanish sentence starters (inverted question and exclamation marks)
10826+
¿ ¡: Xo- & <sent-start>;
1082110827

1082210828
% <sent-start>: "So, don't do it!"
1082310829
% The cost on sent-start is to force preference for CV over WV,
@@ -10834,6 +10840,7 @@ so.ij:
1083410840
% hWl+ is here, not elsewhere, to avoid use together with Xx+
1083510841
LEFT-WALL:
1083610842
<sent-start>
10843+
or Xo+
1083710844
or (QUd+ & <sent-start> & (Xc+ or [()]) & QUc+)
1083810845
or (hWl+ & {Xj+} & (RW+ or Xp+))
1083910846
or (QUd+ & hWl+ & {Xj+} & (Xc+ or [()]) & QUc+)
@@ -10858,17 +10865,18 @@ RIGHT-WALL: RW- or ({@Xca-} & [[Xc-]]);
1085810865
<post-quote>:
1085910866
QUc- & {<wo-wall> or <wi-wall> or CP+};
1086010867

10861-
« 《 【 『 „ “:
10868+
% Opening quote marks. Each one pairs with a closing mark in the
% <post-quote> entry that follows; 「 is the partner of 」.
« 《 【 『 「 „ “ ''.x ’’.x :
1086210869
QUd-;
10863-
» 》 】 』 :
10870+
» 》 】 』 」 ” ''.y ’’.y :
1086410871
<post-quote>;
1086510872

1086610873
% For now, using ".x and ".y in the above definitions multiplies the number
1086710874
% of linkages by 2^(number of "). So it is separated below.
1086810875

1086910876
% [[ZZZ-]]: link to "random" quotation marks that show up "for no reason".
1087010877
% Cannot use a blanket W+ here to pick up all W connectors, because ... ??
10871-
""": QUd- or <post-quote> or [[ZZZ-]];
10878+
% Underbar used like a quote-mark, e.g. for _bold text_
10879+
""" "_": QUd- or <post-quote> or [[ZZZ-]];
1087210880

1087310881
% Using backtic.x and backtic.y in the above definitions multiplies the
1087410882
% number of linkages by 2^(number of backtics). So it is treated as a
@@ -10886,8 +10894,8 @@ changequote dnl
1088610894
or Xi-
1088710895
or <sent-split>;
1088810896

10889-
% Optional RW: "Is this a test?" she asked.
10890-
"!" "?" ‽ ؟ ?:
10897+
% Question and exclamation marks: Optional RW: "Is this a test?" she asked.
10898+
"!" "?" ‽ ؟ ?:
1089110899
(Xp- & RW+)
1089210900
or ({@Xca-} & Xc- & {[RW+]})
1089310901
or ({@Xca-} & Xq+)
@@ -11005,8 +11013,9 @@ but.ij and.ij or.ij not.ij also.ij then.ij but_not and_not and_yet:
1100511013

1100611014
% 、 is the "enumeration-comma" used like a dash ...
1100711015
% ‧ is the "middle dot"
11008-
% The four dashes are e28092 e28093 e28094 e28095
11009-
‒ – — ― ━ ー --.r -.r 、 ~.r ~.r ‧.r :
11016+
% The first two short dashes are e28090 e28091
11017+
% The next four long dashes are e28092 e28093 e28094 e28095
11018+
‐ ‑ ‒ – — ― ━ ー --.r -.r 、 ~.r ~.r ‧.r :
1101011019
[[<colon>]]
1101111020
or ({@Xca-} & (({EBx+} & Xd+) or Xc-))
1101211021
or (Wd- & W+)
@@ -11070,18 +11079,15 @@ but.ij and.ij or.ij not.ij also.ij then.ij but_not and_not and_yet:
1107011079

1107111080
% See also /en/words/currency for currency names that follow a number.
1107211081
$ USD.c US$.c C$.c AUD.c AUD$.c HK.c HK$.c
11073-
£ ₤ € ¤ ₳ ฿ ¢ ₵ ₡ ₢ ₠ ₫ ৳ ƒ ₣ ₲ ₴ ₭ ₺ ℳ ₥ ₦ ₧ ₱ ₰ ₹ ₨ ₪ ₸ ₮ ₩ ¥ ៛ 호점
11082+
£ ₤ € ¤ ₳ ฿ ¢ ₵ ₡ ₢ ₠ ₫ ৳ ƒ ₣ ₲ ₴ ₭ ₺ ℳ ₥ ₦ ₧ ₱ ₰ ₹ ₨ ₪ ₸ ₮ ₩ ¥ ៛ 호점
1107411083
† †† ‡ § ¶ © ® ℗ № "#":
1107511084
NM*x+ & (AN+ or NM*y- or [[G+]] or (NIfu+ or NItu-) or
1107611085
({EN- or NIc- or [[A- & NSa-]]} & {@MX+} &
1107711086
(OD- or ({DD-} & {[[@M+]]} &
1107811087
(<noun-main-p> or <noun-and-p> or [[(Ss+ & <CLAUSE>) or SIs-]])))));
1107911088

1108011089
% service mark, trademark.
11081-
% ℠ ™ :
11082-
11083-
% Espagnol stuff
11084-
% ¿ ¡:
11090+
℠ ™ : G- ;
1108511091

1108611092
"&": G- & {Xd- & G-} & G+;
1108711093

@@ -11099,9 +11105,9 @@ $ USD.c US$.c C$.c AUD.c AUD$.c HK.c HK$.c
1109911105
YS- & (({AL-} & {@L+} & (D+ or DD+)) or [[<noun-main-x>]] or DP+);
1110011106

1110111107
% Wd-: allows "(1 + 1) = 2"
11102-
"(" "[": {Wd-} & {EBx+} & dXdp+;
11108+
"(" "{" "[" "<" 〈 ( 〔 [ : {Wd-} & {EBx+} & dXdp+;
1110311109
11104-
")" "]": {@Xca-} & dXcp-;
11110+
")" "}" "]" ">" 〉 ) 〕 ]: {@Xca-} & dXcp-;
1110511111

1110611112
% foo: F+;
1110711113

data/en/corpus-fixes.batch

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4317,6 +4317,12 @@ I know I ought to tell her.
43174317
% punctuation: sometimes commas are used as if they were semicolons:
43184318
Sweat stood on his brow, fury was bright in his one good eye.
43194319

4320+
% Spanish punctuation
4321+
¿this is a question?
4322+
¡this is an exclamation!
4323+
4324+
Underbars are used for _bold text_.
4325+
43204326
% Crazy UTF8 dashes
43214327
we are dealing with androcentric—that is to say, male-dominated—concepts
43224328
we are dealing with androcentric--that is to say, male-dominated--concepts

0 commit comments

Comments
 (0)