|
| 1 | +#!/usr/bin/perl |
| 2 | + |
| 3 | +use warnings; |
| 4 | +use strict; |
| 5 | +use utf8; |
| 6 | + |
| 7 | +use Getopt::Std; |
| 8 | + |
| 9 | +=pod |
| 10 | +
|
| 11 | +=head1 NAME |
| 12 | +
|
| 13 | +genwordlists.pl - generate word lists for Tesseract |
| 14 | +
|
| 15 | +=head1 SYNOPSIS |
| 16 | +
|
| 17 | +genwordlists.pl -i large_text_file -d outdir -p lang |
| 18 | +
|
| 19 | +=head1 DESCRIPTION |
| 20 | +
|
| 21 | + genwordlists.pl -i large_text_file -d outdir -p lang |
| 22 | +
|
| 23 | +Creates 4 files in C<outdir>: F<lang.word.bigrams.unsorted>, |
| 24 | +F<lang.word.numbers.unsorted>, F<lang.word.punc.unsorted>, and |
| 25 | +F<lang.wordlist.unsorted>, which (when sorted) can be used with |
| 26 | +C<wordlist2dawg> for Tesseract's language data. |
| 27 | +
|
| 28 | +The script can also run as a filter. Given a set of files created |
| 29 | +by WikiExtractor (L<http://medialab.di.unipi.it/Project/SemaWiki/Tools/WikiExtractor.py>), |
| 30 | +use: |
| 31 | +
|
| 32 | + find WikiExtractor -type f | while read i; do \ |
| 33 | + pfx=$(echo $i|tr '/' '_'); cat $i | \ |
| 34 | + perl genwordlists.pl -d OUTDIR -p $pfx; done |
| 35 | +
|
| 36 | +This will create a set of output files to match each of the files |
| 37 | +WikiExtractor created. |
| 38 | +
|
| 39 | +To combine these files: |
| 40 | +
|
| 41 | + for i in word.bigrams.unsorted word.numbers.unsorted \ |
| 42 | + word.punc.unsorted wordlist.unsorted; do \ |
| 43 | + find OUTDIR -name "*$i" -exec cat '{}' \; |\ |
| 44 | + perl -CS -ane 'BEGIN{my %c=();} chomp; |
| 45 | + my($a,$b)=split/\t/;if(defined $c{$a}){$c{$a}+=$b} |
| 46 | + else {$c{$a} = $b;} END{while(my($k,$v)=each %c) |
| 47 | + {print "$v\t$k\n";}}'|sort -nr > tmp.$i ;done |
| 48 | +
|
| 49 | +Followed by: |
| 50 | +
|
| 51 | + for i in word.punc.unsorted word.bigrams.unsorted \ |
| 52 | + word.numbers.unsorted;do cat tmp.$i \ |
| 53 | + awk -F'\t' '{print $2 "\t" $1}' > real.$i ; done |
| 54 | + cat tmp.wordlist.unsorted | awk -F'\t' '{print $2}' \ |
| 55 | + > real.wordlist.unsorted |
| 56 | +
|
| 57 | +Note that, although the langdata repository contains the |
| 58 | +counts of each item in most of the punctuation, number, and |
| 59 | +bigram files, these files must be filtered to only contain |
| 60 | +the first column, otherwise C<wordlist2dawg> will fail to write |
| 61 | +the output file. |
| 62 | +
|
| 63 | +=head1 CAVEATS |
| 64 | +
|
| 65 | +The format of the output files, and how the data are extracted, |
| 66 | +is based only on staring at the input files and taking a guess. |
| 67 | +They may be wildly inaccurate. |
| 68 | +
|
| 69 | +The only part I can say for certain is correct is that digits |
| 70 | +are replaced with '?' in the .numbers wordlist. (See F<dict/dict.cpp> |
| 71 | +in the Tesseract source). |
| 72 | +
|
| 73 | +=head1 COPYRIGHT |
| 74 | +
|
| 75 | +Copyright 2014 Jim O'Regan |
| 76 | +
|
| 77 | +Licensed under the Apache License, Version 2.0 (the "License"); |
| 78 | +you may not use this file except in compliance with the License. |
| 79 | +You may obtain a copy of the License at |
| 80 | +
|
| 81 | +L<http://www.apache.org/licenses/LICENSE-2.0> |
| 82 | +
|
| 83 | +Unless required by applicable law or agreed to in writing, software |
| 84 | +distributed under the License is distributed on an "AS IS" BASIS, |
| 85 | +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 86 | +See the License for the specific language governing permissions and |
| 87 | +limitations under the License. |
| 88 | +
|
| 89 | +=head1 SEE ALSO |
| 90 | +
|
| 91 | +L<wordlist2dawg(1)> |
| 92 | +
|
| 93 | +=cut |
| 94 | + |
| 95 | +# I haven't looked into this too much |
| 96 | +my %lig = ( |
| 97 | + # Longest first |
| 98 | + 'ffi' => 'ffi', |
| 99 | + 'ct' => "\N{U+E003}", |
| 100 | + 'ff' => 'ff', |
| 101 | + 'fi' => 'fi', |
| 102 | + 'fl' => 'fl', |
| 103 | + 'st' => 'st', |
| 104 | +); |
| 105 | + |
| 106 | +my %punct; |
| 107 | +my %num; |
| 108 | +my %bigrams; |
| 109 | +my %opts; |
| 110 | +my %words; |
| 111 | + |
| 112 | +my $do_ligatures = 0; |
| 113 | + |
| 114 | +getopts("hli:p:d:", \%opts); |
| 115 | + |
| 116 | +if (defined $opts{h}) { |
| 117 | + print "Usage: genwordlists [options]\n"; |
| 118 | + print "-h\tPrints a brief help message\n"; |
| 119 | + print "-d\tSet the output directory (default is current)\n"; |
| 120 | + print "-b\tSet the prefix for the language data (e.g., eng for English)\n"; |
| 121 | + print "-l\tProcess ligatures\n"; |
| 122 | + print "-i\tSet the input file. If not set, reads from stdin\n"; |
| 123 | + exit; |
| 124 | +} |
| 125 | + |
| 126 | +if (defined $opts{l}) { |
| 127 | + $do_ligatures = 1; |
| 128 | +} |
| 129 | + |
| 130 | +my $prefix = ''; |
| 131 | +if (!defined $opts{p}) { |
| 132 | + print "Prefix (-p) must be set!\n"; |
| 133 | + exit; |
| 134 | +} else { |
| 135 | + if (defined $opts{d}) { |
| 136 | + $prefix = $opts{d}; |
| 137 | + $prefix =~ s/\/$//; |
| 138 | + $prefix .= '/'; |
| 139 | + } |
| 140 | + $prefix .= $opts{p}; |
| 141 | + # Easiest is to drop it, if present, and readd |
| 142 | + $prefix =~ s/\.$//; |
| 143 | + $prefix .= "."; |
| 144 | +} |
| 145 | + |
| 146 | +my $input; |
| 147 | +if (defined $opts{i}) { |
| 148 | + open ($input, "<", $opts{i}) or die $!; |
| 149 | +#} elsif ($#ARGV > 0) { |
| 150 | +# open ($input, "<", $ARGV[0]) or die $!; |
| 151 | +} else { |
| 152 | + $input = *STDIN; |
| 153 | +} |
| 154 | +binmode $input, ":utf8"; |
| 155 | + |
| 156 | +while (<$input>) { |
| 157 | + chomp; |
| 158 | + tr/\t/ /; |
| 159 | + |
| 160 | + next if (/^<doc/); |
| 161 | + next if (/^<\/doc/); |
| 162 | + next if (/^$/); |
| 163 | + next if (/^[ \t]*$/); |
| 164 | + next if (/^\]\]$/); |
| 165 | + |
| 166 | + my @punct = $_ =~ /([ \p{Punct}]*)/g; |
| 167 | + for my $i (@punct) { |
| 168 | + if(defined($punct{$i})) { |
| 169 | + $punct{$i}++; |
| 170 | + } else { |
| 171 | + $punct{$i} = 1; |
| 172 | + } |
| 173 | + } |
| 174 | + my @rawnumtok = split(/ /); |
| 175 | + my @numtok = map { local $_ = $_; s/[0-9]/ /g; $_ } grep(/[0-9]/, @rawnumtok); |
| 176 | + for my $i (@numtok) { |
| 177 | + if(defined($num{$i})) { |
| 178 | + $num{$i}++; |
| 179 | + } else { |
| 180 | + $num{$i} = 1; |
| 181 | + } |
| 182 | + } |
| 183 | + |
| 184 | + my @bitoksraw = map { local $_ = $_; s/[0-9]/?/g; $_ } split(/ |[ \p{Punct}][ \p{Punct}]+/); |
| 185 | + if ($#bitoksraw > 0) { |
| 186 | + my @first = @bitoksraw; |
| 187 | + my $discard = shift @bitoksraw; |
| 188 | + for (my $j = 0; $j != $#first; $j++) { |
| 189 | + if ($bitoksraw[$j] ne '' && $first[$j] ne '') { |
| 190 | + my $tok = $first[$j] . " " . $bitoksraw[$j]; |
| 191 | + #Not keeping count of these, but this can be useful for trimming |
| 192 | + if(defined($bigrams{$tok})) { |
| 193 | + $bigrams{$tok}++; |
| 194 | + } else { |
| 195 | + $bigrams{$tok} = 1; |
| 196 | + } |
| 197 | + if($do_ligatures == 1) { |
| 198 | + my $other = do_lig($tok); |
| 199 | + if ($other ne $tok) { |
| 200 | + if(defined($bigrams{$other})) { |
| 201 | + $bigrams{$other}++; |
| 202 | + } else { |
| 203 | + $bigrams{$other} = 1; |
| 204 | + } |
| 205 | + } |
| 206 | + } |
| 207 | + } |
| 208 | + } |
| 209 | + } |
| 210 | + my @wordl = grep { !/[0-9 \p{Punct}]/ } split (/[ \p{Punct}]+/); |
| 211 | + if ($#wordl >= 0) { |
| 212 | + for my $word (@wordl) { |
| 213 | + if (defined $words{$word}) { |
| 214 | + $words{$word}++; |
| 215 | + } else { |
| 216 | + $words{$word} = 1; |
| 217 | + } |
| 218 | + } |
| 219 | + } |
| 220 | +} |
| 221 | + |
| 222 | +if (defined $opts{i}) { |
| 223 | + close $input; |
| 224 | +} |
| 225 | + |
| 226 | +open(BIGRAMS, ">", "${prefix}word.bigrams.unsorted"); |
| 227 | +binmode BIGRAMS, ":utf8"; |
| 228 | +while (my($k, $v) = each %bigrams) { |
| 229 | + print BIGRAMS "$k\t$v\n"; |
| 230 | +} |
| 231 | +close BIGRAMS; |
| 232 | +%bigrams = (); |
| 233 | + |
| 234 | +open(PUNCT, ">", "${prefix}word.punc.unsorted"); |
| 235 | +binmode PUNCT, ":utf8"; |
| 236 | +while (my($k, $v) = each %punct) { |
| 237 | + print PUNCT "$k\t$v\n"; |
| 238 | +} |
| 239 | +close PUNCT; |
| 240 | +%punct = (); |
| 241 | + |
| 242 | +open(NUMS, ">", "${prefix}word.numbers.unsorted"); |
| 243 | +binmode NUMS, ":utf8"; |
| 244 | +while (my($k, $v) = each %num) { |
| 245 | + print NUMS "$k\t$v\n"; |
| 246 | +} |
| 247 | +close NUMS; |
| 248 | +%num = (); |
| 249 | + |
| 250 | +open(WORDS, ">", "${prefix}wordlist.unsorted"); |
| 251 | +binmode WORDS, ":utf8"; |
| 252 | +while (my($k, $v) = each %words) { |
| 253 | + print WORDS "$k\t$v\n"; |
| 254 | +} |
| 255 | +close WORDS; |
| 256 | +%words = (); |
| 257 | + |
| 258 | +sub do_lig { |
| 259 | + my $word = shift; |
| 260 | + while (my($k, $v) = each %lig) { |
| 261 | + $word =~ s/$k/$v/g; |
| 262 | + } |
| 263 | + $word; |
| 264 | +} |
0 commit comments