Skip to content

Commit ea7f480

Browse files
committed
add option for UNLV tests for spa
1 parent a662306 commit ea7f480

File tree

5 files changed

+69
-40
lines changed

5 files changed

+69
-40
lines changed

unlvtests/README.md

+18-8
Original file line numberDiff line numberDiff line change
@@ -5,16 +5,18 @@ published in the Fourth Annual Test of OCR Accuracy.
55
See http://www.isri.unlv.edu/downloads/AT-1995.pdf
66
but first you have to get the tools and data used by UNLV:
77

8-
Step 1: to download the images goto
8+
Step 1: to download the images go to
99
https://sourceforge.net/projects/isri-ocr-evaluation-tools-alt/files/
1010
and get doe3.3B.tar.gz, bus.3B.tar.gz, mag.3B.tar.gz and news.3B.tar.gz
11+
The copy of spn.3B.tar.gz in that SourceForge project is incorrect, so get it from the Google Code archive instead
1112

1213
mkdir -p ~/isri-downloads
1314
cd ~/isri-downloads
1415
curl -L https://sourceforge.net/projects/isri-ocr-evaluation-tools-alt/files/bus.3B.tar.gz > bus.3B.tar.gz
1516
curl -L https://sourceforge.net/projects/isri-ocr-evaluation-tools-alt/files/doe3.3B.tar.gz > doe3.3B.tar.gz
1617
curl -L https://sourceforge.net/projects/isri-ocr-evaluation-tools-alt/files/mag.3B.tar.gz > mag.3B.tar.gz
1718
curl -L https://sourceforge.net/projects/isri-ocr-evaluation-tools-alt/files/news.3B.tar.gz > news.3B.tar.gz
19+
curl -L https://storage.googleapis.com/google-code-archive-downloads/v2/code.google.com/isri-ocr-evaluation-tools/spn.3B.tar.gz > spn.3B.tar.gz
1820

1921
Step 2: extract the files. It doesn't really matter where
2022
in your filesystem you put them, but they must go under a common
@@ -27,19 +29,27 @@ tar xzvf ~/isri-downloads/bus.3B.tar.gz
2729
tar xzvf ~/isri-downloads/doe3.3B.tar.gz
2830
tar xzvf ~/isri-downloads/mag.3B.tar.gz
2931
tar xzvf ~/isri-downloads/news.3B.tar.gz
32+
tar xzvf ~/isri-downloads/spn.3B.tar.gz
3033

31-
Step 4: Download the modified ISRI toolkit from:
32-
https://ancientgreekocr.org/ocr-evaluation-tools.git
34+
**** Edit ~/ISRI-OCRtk/spn.3B/pages
35+
and delete the line containing the following image name, as it crashes tesseract:
36+
7733_005.3B.tif
3337

34-
make and install the tools in unlvtests/ocreval/bin by
35-
`make PREFIX=~/tesseract/unlvtests/ocreval install`
38+
Step 4: Download the modified ISRI toolkit, then build and install the tools:
39+
40+
git clone https://github.com/Shreeshrii/ocr-evaluation-tools.git
41+
cd ocr-evaluation-tools
42+
sudo make install
3643

3744
Step 6: cd back to your main tesseract-ocr dir and Build tesseract.
3845

39-
Step 7: run unlvtests/runalltests.sh with the root ISRI data dir and testname:
40-
unlvtests/runalltests.sh ~/ISRI-OCRtk tess4.0.0-beta.1
46+
Step 7: run unlvtests/runalltests.sh with the root ISRI data dir, testname, tessdata-dir and language:
47+
48+
unlvtests/runalltests.sh ~/ISRI-OCRtk 4_fast_eng ../tessdata_fast eng
4149
and go to the gym, have lunch etc.
4250

4351
Step 8: There should be a file
44-
unlvtests/reports/tess4.0.0-beta.1.summary that contains the final summarized accuracy
52+
unlvtests/reports/4_fast_eng.summary that contains the final summarized accuracy
4553
report and comparison with the 1995 results.
54+
55+
To run the Spanish (spn.3B) test set instead, use:
unlvtests/runalltests.sh ~/ISRI-OCRtk 4_fast_spa ../tessdata_fast spa

unlvtests/counttestset.sh

+8-8
Original file line numberDiff line numberDiff line change
@@ -43,17 +43,17 @@ do
4343
else
4444
srcdir="$imdir"
4545
fi
46-
echo "$srcdir/$page.tif"
46+
#echo "$srcdir/$page.tif"
4747
# Count character errors.
48-
unlvtests/ocreval/bin/ocrevalutf8 unlvtests/ocreval/bin/accuracy "$srcdir/$page.txt" "$resdir/$page.unlv" "$resdir/$page.acc"
49-
accfiles="$accfiles $resdir/$page.acc"
48+
ocrevalutf8 accuracy "$srcdir/$page.txt" "$resdir/$page.unlv" > "$resdir/$page.acc"
49+
accfiles="$accfiles $resdir/$page.acc"
5050
# Count word errors.
51-
unlvtests/ocreval/bin/ocrevalutf8 unlvtests/ocreval/bin/wordacc "$srcdir/$page.txt" "$resdir/$page.unlv" "$resdir/$page.wa"
51+
ocrevalutf8 wordacc "$srcdir/$page.txt" "$resdir/$page.unlv" > "$resdir/$page.wa"
5252
wafiles="$wafiles $resdir/$page.wa"
5353
done <"$pages"
5454

55-
echo "$accfiles"
56-
echo "$wafiles"
55+
#echo "$accfiles"
56+
#echo "$wafiles"
5757

58-
unlvtests/ocreval/bin/accsum $accfiles >"unlvtests/reports/$setname.characc"
59-
unlvtests/ocreval/bin/ocrevalutf8 unlvtests/ocreval/bin/wordaccsum $wafiles >"unlvtests/reports/$setname.wordacc"
58+
accsum $accfiles >"unlvtests/results/$setname.characc"
59+
wordaccsum $wafiles >"unlvtests/results/$setname.wordacc"

unlvtests/reports/1995.spn.3B.sum

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
1995 spn.3B 100 95.00% 0.00% 100 95.00% 0.00% 100 95.00% 0.00% WAS NOT TESTED

unlvtests/runalltests.sh

+33-17
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,9 @@
1515
# See the License for the specific language governing permissions and
1616
# limitations under the License.
1717

18-
if [ $# -ne 2 ]
18+
if [ $# -ne 4 ]
1919
then
20-
echo "Usage:$0 unlv-data-dir version-id"
20+
echo "Usage:$0 unlv-data-dir version-id tessdata-dir lang "
2121
exit 1
2222
fi
2323
if [ ! -d src/api ]
@@ -30,7 +30,8 @@ then
3030
echo "Please build tesseract before running $0"
3131
exit 1
3232
fi
33-
33+
tessdata=$3
34+
lang=$4
3435

3536
#deltapc new old calculates the %change from old to new
3637
deltapc() {
@@ -60,8 +61,20 @@ then
6061
bindir="./"
6162
fi
6263
rdir=unlvtests/reports
63-
#testsets="bus.3B doe3.3B mag.3B news.3B"
64-
testsets="bus.3B"
64+
65+
if [ "$lang" = "eng" ]
66+
then
67+
testsets="bus.3B doe3.3B mag.3B news.3B"
68+
#testsets="bus.3B"
69+
else
70+
if [ "$lang" = "spa" ]
71+
then
72+
testsets="spn.3B"
73+
else
74+
echo "Language has to be eng or spa"
75+
exit 1
76+
fi
77+
fi
6578

6679
totalerrs=0
6780
totalwerrs=0
@@ -74,38 +87,38 @@ do
7487
if [ -r "$imdir/$set/pages" ]
7588
then
7689
# Run tesseract on all the pages.
77-
$bindir/runtestset.sh "$imdir/$set/pages"
90+
$bindir/runtestset.sh "$imdir/$set/pages" "$tessdata" "$lang"
7891
# Count the errors on all the pages.
7992
$bindir/counttestset.sh "$imdir/$set/pages"
8093
# Get the old character word and nonstop word errors.
8194
olderrs=$(cut -f3 "unlvtests/reports/1995.$set.sum")
8295
oldwerrs=$(cut -f6 "unlvtests/reports/1995.$set.sum")
8396
oldnswerrs=$(cut -f9 "unlvtests/reports/1995.$set.sum")
8497
# Get the new character word and nonstop word errors and accuracy.
85-
cherrs=$(head -4 "unlvtests/reports/$set.characc" |tail -1 |cut -c1-9 |
98+
cherrs=$(head -4 "unlvtests/results/$set.characc" |tail -1 |cut -c1-9 |
8699
tr -d '[:blank:]')
87-
chacc=$(head -5 "unlvtests/reports/$set.characc" |tail -1 |cut -c1-9 |
100+
chacc=$(head -5 "unlvtests/results/$set.characc" |tail -1 |cut -c1-9 |
88101
tr -d '[:blank:]')
89-
wderrs=$(head -4 "unlvtests/reports/$set.wordacc" |tail -1 |cut -c1-9 |
102+
wderrs=$(head -4 "unlvtests/results/$set.wordacc" |tail -1 |cut -c1-9 |
90103
tr -d '[:blank:]')
91-
wdacc=$(head -5 "unlvtests/reports/$set.wordacc" |tail -1 |cut -c1-9 |
104+
wdacc=$(head -5 "unlvtests/results/$set.wordacc" |tail -1 |cut -c1-9 |
92105
tr -d '[:blank:]')
93-
nswderrs=$(grep Total "unlvtests/reports/$set.wordacc" |head -2 |tail -1 |
106+
nswderrs=$(grep Total "unlvtests/results/$set.wordacc" |head -2 |tail -1 |
94107
cut -c10-17 |tr -d '[:blank:]')
95-
nswdacc=$(grep Total "unlvtests/reports/$set.wordacc" |head -2 |tail -1 |
108+
nswdacc=$(grep Total "unlvtests/results/$set.wordacc" |head -2 |tail -1 |
96109
cut -c19-26 |tr -d '[:blank:]')
97110
# Compute the percent change.
98111
chdelta=$(deltapc "$cherrs" "$olderrs")
99112
wdelta=$(deltapc "$wderrs" "$oldwerrs")
100113
nswdelta=$(deltapc "$nswderrs" "$oldnswerrs")
101114
sumfile=$rdir/$vid.$set.sum
102-
if [ -r "unlvtests/reports/$set.times" ]
115+
if [ -r "unlvtests/results/$set.times" ]
103116
then
104-
total_time=$(timesum "unlvtests/reports/$set.times")
105-
if [ -r "unlvtests/reports/prev/$set.times" ]
117+
total_time=$(timesum "unlvtests/results/$set.times")
118+
if [ -r "unlvtests/results/prev/$set.times" ]
106119
then
107-
paste "unlvtests/reports/prev/$set.times" "unlvtests/reports/$set.times" |
108-
awk '{ printf("%s %.2f\n", $1, $4-$2); }' |sort -k2n >"unlvtests/reports/$set.timedelta"
120+
paste "unlvtests/results/prev/$set.times" "unlvtests/results/$set.times" |
121+
awk '{ printf("%s %.2f\n", $1, $4-$2); }' |sort -k2n >"unlvtests/results/$set.timedelta"
109122
fi
110123
else
111124
total_time='0.0'
@@ -129,3 +142,6 @@ tfile=$rdir/$vid.total.sum
129142
echo "$vid Total $totalerrs - $chdelta% $totalwerrs\
130143
- $wdelta% $totalnswerrs - $nswdelta%" >"$tfile"
131144
cat $rdir/1995.*.sum "$rdir/$vid".*.sum >"$rdir/$vid".summary
145+
146+
mv "$rdir/$vid".*.sum unlvtests/results/
147+
cat "$rdir/$vid".summary

unlvtests/runtestset.sh

+9-7
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,9 @@
1515
# See the License for the specific language governing permissions and
1616
# limitations under the License.
1717

18-
if [ $# -ne 1 ] && [ $# -ne 2 ]
18+
if [ $# -ne 3 ] && [ $# -ne 4 ]
1919
then
20-
echo "Usage:$0 pagesfile [-zoning]"
20+
echo "Usage:$0 pagesfile tessdata-dir lang [-zoning]"
2121
exit 1
2222
fi
2323
if [ ! -d src/api ]
@@ -36,13 +36,15 @@ then
3636
fi
3737
else
3838
tess="time -f %U -o times.txt src/api/tesseract"
39-
export TESSDATA_PREFIX=$PWD/
39+
#tess="time -f %U -o times.txt tesseract"
4040
fi
4141

42+
tessdata=$2
43+
lang=$3
4244
pages=$1
4345
imdir=${pages%/pages}
4446
setname=${imdir##*/}
45-
if [ $# -eq 2 ] && [ "$2" = "-zoning" ]
47+
if [ $# -eq 4 ] && [ "$4" = "-zoning" ]
4648
then
4749
config=unlv.auto
4850
resdir=unlvtests/results/zoning.$setname
@@ -52,7 +54,7 @@ else
5254
fi
5355
echo -e "Testing on set $setname in directory $imdir to $resdir\n"
5456
mkdir -p "$resdir"
55-
rm -f "unlvtests/reports/$setname.times"
57+
rm -f "unlvtests/results/$setname.times"
5658
while read page dir
5759
do
5860
# A pages file may be a list of files with subdirs or maybe just
@@ -64,11 +66,11 @@ do
6466
srcdir="$imdir"
6567
fi
6668
# echo "$srcdir/$page.tif"
67-
$tess "$srcdir/$page.tif" "$resdir/$page" --tessdata-dir ../tessdata_fast --oem 1 -l eng --psm 6 $config 2>&1 |grep -v "OCR Engine" |grep -v "Page 1"
69+
$tess "$srcdir/$page.tif" "$resdir/$page" --tessdata-dir $tessdata --oem 1 -l $lang --psm 6 $config 2>&1 |grep -v "OCR Engine" |grep -v "Page 1"
6870
if [ -r times.txt ]
6971
then
7072
read t <times.txt
71-
echo "$page $t" >>"unlvtests/reports/$setname.times"
73+
echo "$page $t" >>"unlvtests/results/$setname.times"
7274
echo -e "\033M$page $t"
7375
if [ "$t" = "Command terminated by signal 2" ]
7476
then

0 commit comments

Comments
 (0)