Skip to content

Commit 3458e7c

Browse files
committed
helper script to generate dawg input files from text
1 parent 37f568d commit 3458e7c

File tree

1 file changed

+264
-0
lines changed

1 file changed

+264
-0
lines changed

contrib/genlangdata.pl

+264
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,264 @@
1+
#!/usr/bin/perl
2+
3+
use warnings;
4+
use strict;
5+
use utf8;
6+
7+
use Getopt::Std;
8+
9+
=pod
10+
11+
=head1 NAME
12+
13+
genwordlists.pl - generate word lists for Tesseract
14+
15+
=head1 SYNOPSIS
16+
17+
genwordlists.pl -i large_text_file -d outdir -p lang
18+
19+
=head1 DESCRIPTION
20+
21+
genwordlists.pl -i large_text_file -d outdir -p lang
22+
23+
Creates 4 files in C<outdir>: F<lang.word.bigrams.unsorted>,
24+
F<lang.word.numbers.unsorted>, F<lang.word.punc.unsorted>, and
25+
F<lang.wordlist.unsorted>, which (when sorted) can be used with
26+
C<wordlist2dawg> for Tesseract's language data.
27+
28+
The script can also run as a filter. Given a set of files created
29+
by WikiExtractor (L<http://medialab.di.unipi.it/Project/SemaWiki/Tools/WikiExtractor.py>),
30+
use:
31+
32+
find WikiExtractor -type f | while read i; do \
33+
pfx=$(echo $i|tr '/' '_'); cat $i | \
34+
perl genwordlists.pl -d OUTDIR -p $pfx; done
35+
36+
This will create a set of output files to match each of the files
37+
WikiExtractor created.
38+
39+
To combine these files:
40+
41+
for i in word.bigrams.unsorted word.numbers.unsorted \
42+
word.punc.unsorted wordlist.unsorted; do \
43+
find OUTDIR -name "*$i" -exec cat '{}' \; |\
44+
perl -CS -ane 'BEGIN{my %c=();} chomp;
45+
my($a,$b)=split/\t/;if(defined $c{$a}){$c{$a}+=$b}
46+
else {$c{$a} = $b;} END{while(my($k,$v)=each %c)
47+
{print "$v\t$k\n";}}'|sort -nr > tmp.$i ;done
48+
49+
Followed by:
50+
51+
for i in word.punc.unsorted word.bigrams.unsorted \
52+
word.numbers.unsorted;do cat tmp.$i \
53+
awk -F'\t' '{print $2 "\t" $1}' > real.$i ; done
54+
cat tmp.wordlist.unsorted | awk -F'\t' '{print $2}' \
55+
> real.wordlist.unsorted
56+
57+
Note that, although the langdata repository contains the
58+
counts of each item in most of the punctuation, number, and
59+
bigram files, these files must be filtered to only contain
60+
the first column, otherwise C<wordlist2dawg> will fail to write
61+
the output file.
62+
63+
=head1 CAVEATS
64+
65+
The format of the output files, and how the data are extracted,
66+
is based only on staring at the input files and taking a guess.
67+
They may be wildly inaccurate.
68+
69+
The only part I can say for certain is correct is that digits
70+
are replaced with '?' in the .numbers wordlist. (See F<dict/dict.cpp>
71+
in the Tesseract source).
72+
73+
=head1 COPYRIGHT
74+
75+
Copyright 2014 Jim O'Regan
76+
77+
Licensed under the Apache License, Version 2.0 (the "License");
78+
you may not use this file except in compliance with the License.
79+
You may obtain a copy of the License at
80+
81+
L<http://www.apache.org/licenses/LICENSE-2.0>
82+
83+
Unless required by applicable law or agreed to in writing, software
84+
distributed under the License is distributed on an "AS IS" BASIS,
85+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
86+
See the License for the specific language governing permissions and
87+
limitations under the License.
88+
89+
=head1 SEE ALSO
90+
91+
L<wordlist2dawg(1)>
92+
93+
=cut
94+
95+
# I haven't looked into this too much
96+
my %lig = (
97+
# Longest first
98+
'ffi' => '',
99+
'ct' => "\N{U+E003}",
100+
'ff' => '',
101+
'fi' => '',
102+
'fl' => '',
103+
'st' => '',
104+
);
105+
106+
my %punct;
107+
my %num;
108+
my %bigrams;
109+
my %opts;
110+
my %words;
111+
112+
my $do_ligatures = 0;
113+
114+
getopts("hli:p:d:", \%opts);
115+
116+
if (defined $opts{h}) {
117+
print "Usage: genwordlists [options]\n";
118+
print "-h\tPrints a brief help message\n";
119+
print "-d\tSet the output directory (default is current)\n";
120+
print "-b\tSet the prefix for the language data (e.g., eng for English)\n";
121+
print "-l\tProcess ligatures\n";
122+
print "-i\tSet the input file. If not set, reads from stdin\n";
123+
exit;
124+
}
125+
126+
if (defined $opts{l}) {
127+
$do_ligatures = 1;
128+
}
129+
130+
my $prefix = '';
131+
if (!defined $opts{p}) {
132+
print "Prefix (-p) must be set!\n";
133+
exit;
134+
} else {
135+
if (defined $opts{d}) {
136+
$prefix = $opts{d};
137+
$prefix =~ s/\/$//;
138+
$prefix .= '/';
139+
}
140+
$prefix .= $opts{p};
141+
# Easiest is to drop it, if present, and readd
142+
$prefix =~ s/\.$//;
143+
$prefix .= ".";
144+
}
145+
146+
my $input;
147+
if (defined $opts{i}) {
148+
open ($input, "<", $opts{i}) or die $!;
149+
#} elsif ($#ARGV > 0) {
150+
# open ($input, "<", $ARGV[0]) or die $!;
151+
} else {
152+
$input = *STDIN;
153+
}
154+
binmode $input, ":utf8";
155+
156+
while (<$input>) {
157+
chomp;
158+
tr/\t/ /;
159+
160+
next if (/^<doc/);
161+
next if (/^<\/doc/);
162+
next if (/^$/);
163+
next if (/^[ \t]*$/);
164+
next if (/^\]\]$/);
165+
166+
my @punct = $_ =~ /([ \p{Punct}]*)/g;
167+
for my $i (@punct) {
168+
if(defined($punct{$i})) {
169+
$punct{$i}++;
170+
} else {
171+
$punct{$i} = 1;
172+
}
173+
}
174+
my @rawnumtok = split(/ /);
175+
my @numtok = map { local $_ = $_; s/[0-9]/ /g; $_ } grep(/[0-9]/, @rawnumtok);
176+
for my $i (@numtok) {
177+
if(defined($num{$i})) {
178+
$num{$i}++;
179+
} else {
180+
$num{$i} = 1;
181+
}
182+
}
183+
184+
my @bitoksraw = map { local $_ = $_; s/[0-9]/?/g; $_ } split(/ |[ \p{Punct}][ \p{Punct}]+/);
185+
if ($#bitoksraw > 0) {
186+
my @first = @bitoksraw;
187+
my $discard = shift @bitoksraw;
188+
for (my $j = 0; $j != $#first; $j++) {
189+
if ($bitoksraw[$j] ne '' && $first[$j] ne '') {
190+
my $tok = $first[$j] . " " . $bitoksraw[$j];
191+
#Not keeping count of these, but this can be useful for trimming
192+
if(defined($bigrams{$tok})) {
193+
$bigrams{$tok}++;
194+
} else {
195+
$bigrams{$tok} = 1;
196+
}
197+
if($do_ligatures == 1) {
198+
my $other = do_lig($tok);
199+
if ($other ne $tok) {
200+
if(defined($bigrams{$other})) {
201+
$bigrams{$other}++;
202+
} else {
203+
$bigrams{$other} = 1;
204+
}
205+
}
206+
}
207+
}
208+
}
209+
}
210+
my @wordl = grep { !/[0-9 \p{Punct}]/ } split (/[ \p{Punct}]+/);
211+
if ($#wordl >= 0) {
212+
for my $word (@wordl) {
213+
if (defined $words{$word}) {
214+
$words{$word}++;
215+
} else {
216+
$words{$word} = 1;
217+
}
218+
}
219+
}
220+
}
221+
222+
if (defined $opts{i}) {
223+
close $input;
224+
}
225+
226+
open(BIGRAMS, ">", "${prefix}word.bigrams.unsorted");
227+
binmode BIGRAMS, ":utf8";
228+
while (my($k, $v) = each %bigrams) {
229+
print BIGRAMS "$k\t$v\n";
230+
}
231+
close BIGRAMS;
232+
%bigrams = ();
233+
234+
open(PUNCT, ">", "${prefix}word.punc.unsorted");
235+
binmode PUNCT, ":utf8";
236+
while (my($k, $v) = each %punct) {
237+
print PUNCT "$k\t$v\n";
238+
}
239+
close PUNCT;
240+
%punct = ();
241+
242+
open(NUMS, ">", "${prefix}word.numbers.unsorted");
243+
binmode NUMS, ":utf8";
244+
while (my($k, $v) = each %num) {
245+
print NUMS "$k\t$v\n";
246+
}
247+
close NUMS;
248+
%num = ();
249+
250+
open(WORDS, ">", "${prefix}wordlist.unsorted");
251+
binmode WORDS, ":utf8";
252+
while (my($k, $v) = each %words) {
253+
print WORDS "$k\t$v\n";
254+
}
255+
close WORDS;
256+
%words = ();
257+
258+
sub do_lig {
259+
my $word = shift;
260+
while (my($k, $v) = each %lig) {
261+
$word =~ s/$k/$v/g;
262+
}
263+
$word;
264+
}

0 commit comments

Comments
 (0)