Skip to content

Commit 578c6a2

Browse files
committed
Add an OCR filter that detects and fixes some common OCR errors. Activate it with the 'ocr' filter or the new 'all' filter.
1 parent b96fd96 commit 578c6a2

File tree

4 files changed

+131
-1
lines changed

4 files changed

+131
-1
lines changed

cmd/subber/subber.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ var (
2020
keepAds = kingpin.Flag("keep-ads", "Do not strip advertisement captions.").Bool()
2121
skipBackups = kingpin.Flag("skip-backups", "Do not make backup (.srt.org) of original .srt").Bool()
2222
language = kingpin.Flag("language", "Language.").Default("en").String()
23-
filterName = kingpin.Flag("filter", "Filter (none, caps, html).").Default("none").String()
23+
filterName = kingpin.Flag("filter", "Filter (none, caps, html, ocr, all).").Default("none").String()
2424
sync = kingpin.Flag("sync", "Synchronize captions (milliseconds).").Int()
2525
)
2626

filter.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,16 @@ import (
77
// FilterCaptions pass the captions through a filter function
88
func (subtitle *Subtitle) FilterCaptions(filter string) {
99
switch filter {
10+
case "all":
11+
subtitle.filterCapitalization()
12+
subtitle.filterHTML()
13+
subtitle.filterOCR()
1014
case "caps":
1115
subtitle.filterCapitalization()
1216
case "html":
1317
subtitle.filterHTML()
18+
case "ocr":
19+
subtitle.filterOCR()
1420
case "none":
1521
default:
1622
fmt.Printf("Unrecognized filter name: %s\n", filter)

filter_ocr.go

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
package subtitles
2+
3+
import (
4+
"strings"
5+
6+
log "github.com/Sirupsen/logrus"
7+
)
8+
9+
// ocrErrors maps a lower-case OCR misreading to its correction. filterOCR
// also derives an upper-case and a capitalised variant from every pair.
// The short keys end in a space, presumably so that they only match at the
// end of a word.
var ocrErrors = map[string]string{
	"s0 ":       "so ",
	"g0 ":       "go ",
	"0n ":       "on ",
	"c0uld":     "could",
	"s0mething": "something",
	"l've":      "i've", // NOTE(review): English spells this "I've"; kept because the tests expect "i've"
}
19+
20+
// filterOCR corrects some OCR mistakes
21+
func (subtitle *Subtitle) filterOCR() *Subtitle {
22+
for _, cap := range subtitle.Captions {
23+
for i, org := range cap.Text {
24+
for bad, good := range ocrErrors {
25+
// lower case
26+
cap.Text[i] = strings.Replace(cap.Text[i], bad, good, -1)
27+
28+
// upper case
29+
cap.Text[i] = strings.Replace(cap.Text[i], strings.ToUpper(bad), strings.ToUpper(good), -1)
30+
31+
// ucfirst
32+
cap.Text[i] = strings.Replace(cap.Text[i], strings.Title(bad), strings.Title(good), -1)
33+
}
34+
35+
if org != cap.Text[i] {
36+
log.Println("[ocr]", org, "->", cap.Text[i])
37+
}
38+
}
39+
}
40+
return subtitle
41+
}

filter_ocr_test.go

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
package subtitles
2+
3+
import (
4+
"testing"
5+
6+
"github.com/stretchr/testify/assert"
7+
)
8+
9+
func TestFilterOCRLower(t *testing.T) {
10+
11+
in := Subtitle{[]Caption{{
12+
1,
13+
makeTime(0, 0, 4, 630),
14+
makeTime(0, 0, 6, 18),
15+
[]string{"s0mething good"},
16+
}}}
17+
18+
expected := Subtitle{[]Caption{{
19+
1,
20+
makeTime(0, 0, 4, 630),
21+
makeTime(0, 0, 6, 18),
22+
[]string{"something good"},
23+
}}}
24+
25+
assert.Equal(t, &expected, in.filterOCR())
26+
}
27+
28+
func TestFilterOCRUpper(t *testing.T) {
29+
30+
in := Subtitle{[]Caption{{
31+
1,
32+
makeTime(0, 0, 4, 630),
33+
makeTime(0, 0, 6, 18),
34+
[]string{"S0METHING GOOD"},
35+
}}}
36+
37+
expected := Subtitle{[]Caption{{
38+
1,
39+
makeTime(0, 0, 4, 630),
40+
makeTime(0, 0, 6, 18),
41+
[]string{"SOMETHING GOOD"},
42+
}}}
43+
44+
assert.Equal(t, &expected, in.filterOCR())
45+
}
46+
47+
func TestFilterOCRUcFirst(t *testing.T) {
48+
49+
in := Subtitle{[]Caption{{
50+
1,
51+
makeTime(0, 0, 4, 630),
52+
makeTime(0, 0, 6, 18),
53+
[]string{"S0mething good"},
54+
}}}
55+
56+
expected := Subtitle{[]Caption{{
57+
1,
58+
makeTime(0, 0, 4, 630),
59+
makeTime(0, 0, 6, 18),
60+
[]string{"Something good"},
61+
}}}
62+
63+
assert.Equal(t, &expected, in.filterOCR())
64+
}
65+
66+
func TestFilterOCREnglish(t *testing.T) {
67+
68+
in := Subtitle{[]Caption{{
69+
1,
70+
makeTime(0, 0, 4, 630),
71+
makeTime(0, 0, 6, 18),
72+
[]string{"l've got a feeling"},
73+
}}}
74+
75+
expected := Subtitle{[]Caption{{
76+
1,
77+
makeTime(0, 0, 4, 630),
78+
makeTime(0, 0, 6, 18),
79+
[]string{"i've got a feeling"},
80+
}}}
81+
82+
assert.Equal(t, &expected, in.filterOCR())
83+
}

0 commit comments

Comments
 (0)