add option for UNLV tests for spa

Shreeshrii · Shreeshrii · commit ea7f4801edba · 2018-06-08T14:28:50.000Z
diff --git a/unlvtests/README.md b/unlvtests/README.md
@@ -5,16 +5,18 @@ published in the Fourth Annual Test of OCR Accuracy.
 See http://www.isri.unlv.edu/downloads/AT-1995.pdf
 but first you have to get the tools and data used by  UNLV:
 
-Step 1: to download the images goto
+Step 1: to download the images go to
 https://sourceforge.net/projects/isri-ocr-evaluation-tools-alt/files/ 
 and get doe3.3B.tar.gz, bus.3B.tar.gz, mag.3B.tar.gz and news.3B.tar.gz
+The spn.3B.tar.gz on SourceForge is incorrect, so get it from the Google Code archive instead.
 
 mkdir -p ~/isri-downloads
 cd ~/isri-downloads
 curl  -L https://sourceforge.net/projects/isri-ocr-evaluation-tools-alt/files/bus.3B.tar.gz > bus.3B.tar.gz
 curl  -L https://sourceforge.net/projects/isri-ocr-evaluation-tools-alt/files/doe3.3B.tar.gz > doe3.3B.tar.gz
 curl  -L https://sourceforge.net/projects/isri-ocr-evaluation-tools-alt/files/mag.3B.tar.gz > mag.3B.tar.gz
 curl  -L https://sourceforge.net/projects/isri-ocr-evaluation-tools-alt/files/news.3B.tar.gz > news.3B.tar.gz
+curl  -L https://storage.googleapis.com/google-code-archive-downloads/v2/code.google.com/isri-ocr-evaluation-tools/spn.3B.tar.gz > spn.3B.tar.gz
 
 Step 2: extract the files. It doesn't really matter where
 in your filesystem you put them, but they must go under a common
@@ -27,19 +29,27 @@ tar xzvf ~/isri-downloads/bus.3B.tar.gz
 tar xzvf ~/isri-downloads/doe3.3B.tar.gz
 tar xzvf ~/isri-downloads/mag.3B.tar.gz
 tar xzvf ~/isri-downloads/news.3B.tar.gz
+tar xzvf ~/isri-downloads/spn.3B.tar.gz
 
-Step 4: Download the modified ISRI toolkit from:
-https://ancientgreekocr.org/ocr-evaluation-tools.git
+**** Edit ~/ISRI-OCRtk/spn.3B/pages and
+delete the line containing the following image name, as it crashes tesseract:
+7733_005.3B.tif
 
-make and install the tools in unlvtests/ocreval/bin by
-`make PREFIX=~/tesseract/unlvtests/ocreval install`
+Step 4: Download the modified ISRI toolkit, then build and install the tools:
+
+git clone https://github.com/Shreeshrii/ocr-evaluation-tools.git ~/ocr-evaluation-tools
+cd ~/ocr-evaluation-tools
+sudo make install
 
 Step 6: cd back to your main tesseract-ocr dir and Build tesseract.
 
-Step 7: run unlvtests/runalltests.sh with the root ISRI data dir and testname:
-unlvtests/runalltests.sh ~/ISRI-OCRtk tess4.0.0-beta.1
+Step 7: run unlvtests/runalltests.sh with the root ISRI data dir, testname, tessdata-dir and language:
+
+unlvtests/runalltests.sh ~/ISRI-OCRtk 4_fast_eng ../tessdata_fast eng
 and go to the gym, have lunch etc.
 
 Step 8: There should be a file
-unlvtests/reports/tess4.0.0-beta.1.summary that contains the final summarized accuracy
+unlvtests/reports/4_fast_eng.summary that contains the final summarized accuracy
 report and comparison with the 1995 results.
+
+unlvtests/runalltests.sh ~/ISRI-OCRtk 4_fast_spa ../tessdata_fast spa
diff --git a/unlvtests/counttestset.sh b/unlvtests/counttestset.sh
@@ -43,17 +43,17 @@ do
   else
      srcdir="$imdir"
   fi
-echo "$srcdir/$page.tif"
+#echo "$srcdir/$page.tif"
   # Count character errors.
-  unlvtests/ocreval/bin/ocrevalutf8  unlvtests/ocreval/bin/accuracy "$srcdir/$page.txt" "$resdir/$page.unlv" "$resdir/$page.acc"
-   accfiles="$accfiles $resdir/$page.acc"
+  ocrevalutf8  accuracy "$srcdir/$page.txt" "$resdir/$page.unlv" > "$resdir/$page.acc"
+  accfiles="$accfiles $resdir/$page.acc"
   # Count word errors.
-    unlvtests/ocreval/bin/ocrevalutf8  unlvtests/ocreval/bin/wordacc "$srcdir/$page.txt" "$resdir/$page.unlv" "$resdir/$page.wa"
+  ocrevalutf8  wordacc "$srcdir/$page.txt" "$resdir/$page.unlv" > "$resdir/$page.wa"
   wafiles="$wafiles $resdir/$page.wa"
 done <"$pages"
 
-echo "$accfiles"
-echo "$wafiles"
+#echo "$accfiles"
+#echo "$wafiles"
 
-  unlvtests/ocreval/bin/accsum $accfiles >"unlvtests/reports/$setname.characc"
-  unlvtests/ocreval/bin/ocrevalutf8 unlvtests/ocreval/bin/wordaccsum $wafiles >"unlvtests/reports/$setname.wordacc"
+accsum $accfiles >"unlvtests/results/$setname.characc"
+wordaccsum $wafiles >"unlvtests/results/$setname.wordacc"
diff --git a/unlvtests/reports/1995.spn.3B.sum b/unlvtests/reports/1995.spn.3B.sum
@@ -0,0 +1 @@
+1995	spn.3B	100	95.00%	0.00%	100	95.00%	0.00%	100	95.00%	0.00% WAS NOT TESTED
diff --git a/unlvtests/runalltests.sh b/unlvtests/runalltests.sh
@@ -15,9 +15,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-if [ $# -ne 2 ]
+if [ $# -ne 4 ]
 then
-   echo "Usage:$0 unlv-data-dir version-id"
+   echo "Usage:$0 unlv-data-dir version-id tessdata-dir lang "
    exit 1
 fi
 if [ ! -d src/api ]
@@ -30,7 +30,8 @@ then
   echo "Please build tesseract before running $0"
   exit 1
 fi
-
+tessdata=$3
+lang=$4
 
 #deltapc new old calculates the %change from old to new
 deltapc() {
@@ -60,8 +61,20 @@ then
     bindir="./"
 fi
 rdir=unlvtests/reports
-#testsets="bus.3B doe3.3B mag.3B news.3B"
-testsets="bus.3B"
+
+if [ "$lang" = "eng" ]
+then
+    testsets="bus.3B doe3.3B mag.3B news.3B"
+    #testsets="bus.3B"
+else
+    if [ "$lang" = "spa" ]
+    then
+        testsets="spn.3B"
+    else
+        echo "Language has to be eng or spa"
+        exit 1
+    fi
+fi
 
 totalerrs=0
 totalwerrs=0
@@ -74,38 +87,38 @@ do
     if [ -r "$imdir/$set/pages" ]
     then
 	# Run tesseract on all the pages.
-	$bindir/runtestset.sh "$imdir/$set/pages"
+	$bindir/runtestset.sh "$imdir/$set/pages" "$tessdata" "$lang"
 	# Count the errors on all the pages.
 	$bindir/counttestset.sh "$imdir/$set/pages"
 	# Get the old character word and nonstop word errors.
 	olderrs=$(cut -f3 "unlvtests/reports/1995.$set.sum")
 	oldwerrs=$(cut -f6 "unlvtests/reports/1995.$set.sum")
 	oldnswerrs=$(cut -f9 "unlvtests/reports/1995.$set.sum")
 	# Get the new character word and nonstop word errors and accuracy.
-	cherrs=$(head -4 "unlvtests/reports/$set.characc" |tail -1 |cut -c1-9 |
+	cherrs=$(head -4 "unlvtests/results/$set.characc" |tail -1 |cut -c1-9 |
 	    tr -d '[:blank:]')
-	chacc=$(head -5 "unlvtests/reports/$set.characc" |tail -1 |cut -c1-9 |
+	chacc=$(head -5 "unlvtests/results/$set.characc" |tail -1 |cut -c1-9 |
 	    tr -d '[:blank:]')
-	wderrs=$(head -4 "unlvtests/reports/$set.wordacc" |tail -1 |cut -c1-9 |
+	wderrs=$(head -4 "unlvtests/results/$set.wordacc" |tail -1 |cut -c1-9 |
 	    tr -d '[:blank:]')
-	wdacc=$(head -5 "unlvtests/reports/$set.wordacc" |tail -1 |cut -c1-9 |
+	wdacc=$(head -5 "unlvtests/results/$set.wordacc" |tail -1 |cut -c1-9 |
 	    tr -d '[:blank:]')
-	nswderrs=$(grep Total "unlvtests/reports/$set.wordacc" |head -2 |tail -1 |
+	nswderrs=$(grep Total "unlvtests/results/$set.wordacc" |head -2 |tail -1 |
 	    cut -c10-17 |tr -d '[:blank:]')
-	nswdacc=$(grep Total "unlvtests/reports/$set.wordacc" |head -2 |tail -1 |
+	nswdacc=$(grep Total "unlvtests/results/$set.wordacc" |head -2 |tail -1 |
 	    cut -c19-26 |tr -d '[:blank:]')
 	# Compute the percent change.
 	chdelta=$(deltapc "$cherrs" "$olderrs")
 	wdelta=$(deltapc "$wderrs" "$oldwerrs")
 	nswdelta=$(deltapc "$nswderrs" "$oldnswerrs")
 	sumfile=$rdir/$vid.$set.sum
-        if [ -r "unlvtests/reports/$set.times" ]
+        if [ -r "unlvtests/results/$set.times" ]
         then
-          total_time=$(timesum "unlvtests/reports/$set.times")
-          if [ -r "unlvtests/reports/prev/$set.times" ]
+          total_time=$(timesum "unlvtests/results/$set.times")
+          if [ -r "unlvtests/results/prev/$set.times" ]
           then
-            paste "unlvtests/reports/prev/$set.times" "unlvtests/reports/$set.times" |
-              awk '{ printf("%s %.2f\n", $1, $4-$2); }' |sort -k2n >"unlvtests/reports/$set.timedelta"
+            paste "unlvtests/results/prev/$set.times" "unlvtests/results/$set.times" |
+              awk '{ printf("%s %.2f\n", $1, $4-$2); }' |sort -k2n >"unlvtests/results/$set.timedelta"
           fi
 	else
           total_time='0.0'
@@ -129,3 +142,6 @@ tfile=$rdir/$vid.total.sum
 echo "$vid	Total	$totalerrs	-	$chdelta%	$totalwerrs\
 	-	$wdelta%	$totalnswerrs	-	$nswdelta%" >"$tfile"
 cat $rdir/1995.*.sum "$rdir/$vid".*.sum >"$rdir/$vid".summary
+
+mv "$rdir/$vid".*.sum unlvtests/results/
+cat "$rdir/$vid".summary
diff --git a/unlvtests/runtestset.sh b/unlvtests/runtestset.sh
@@ -15,9 +15,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-if [ $# -ne 1 ] && [ $# -ne 2 ]
+if  [ $# -ne 3 ] && [ $# -ne 4 ]
 then
-  echo "Usage:$0 pagesfile [-zoning]"
+  echo "Usage:$0 pagesfile tessdata-dir lang [-zoning]"
   exit 1
 fi
 if [ ! -d src/api ]
@@ -36,13 +36,15 @@ then
   fi
 else
   tess="time -f %U -o times.txt src/api/tesseract"
-  export TESSDATA_PREFIX=$PWD/
+  #tess="time -f %U -o times.txt tesseract"
 fi
 
+tessdata=$2
+lang=$3
 pages=$1
 imdir=${pages%/pages}
 setname=${imdir##*/}
-if [ $# -eq 2 ] && [ "$2" = "-zoning" ]
+if [ $# -eq 4 ] && [ "$4" = "-zoning" ]
 then
   config=unlv.auto
   resdir=unlvtests/results/zoning.$setname
@@ -52,7 +54,7 @@ else
 fi
 echo -e "Testing on set $setname in directory $imdir to $resdir\n"
 mkdir -p "$resdir"
-rm -f "unlvtests/reports/$setname.times"
+rm -f "unlvtests/results/$setname.times"
 while read page dir
 do
   # A pages file may be a list of files with subdirs or maybe just
@@ -64,11 +66,11 @@ do
      srcdir="$imdir"
   fi
 #  echo "$srcdir/$page.tif"
-  $tess "$srcdir/$page.tif" "$resdir/$page" --tessdata-dir ../tessdata_fast --oem 1 -l eng --psm 6 $config 2>&1 |grep -v "OCR Engine" |grep -v "Page 1"
+  $tess "$srcdir/$page.tif" "$resdir/$page" --tessdata-dir $tessdata --oem 1 -l $lang --psm 6 $config 2>&1 |grep -v "OCR Engine" |grep -v "Page 1"
   if [ -r times.txt ]
   then
     read t <times.txt
-    echo "$page $t" >>"unlvtests/reports/$setname.times"
+    echo "$page $t" >>"unlvtests/results/$setname.times"
     echo -e "\033M$page $t"
     if [ "$t" = "Command terminated by signal 2" ]
     then

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+1995 spn.3B 100 95.00% 0.00% 100 95.00% 0.00% 100 95.00% 0.00% WAS NOT TESTED`