Skip to content

Commit 198a1cb

Browse files
committed
ocr: normalize capitalization
1 parent 578c6a2 commit 198a1cb

File tree

2 files changed

+55
-0
lines changed

2 files changed

+55
-0
lines changed

filter_ocr.go

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ package subtitles
22

33
import (
44
"strings"
5+
"unicode"
56

67
log "github.com/Sirupsen/logrus"
78
)
@@ -32,10 +33,45 @@ func (subtitle *Subtitle) filterOCR() *Subtitle {
3233
cap.Text[i] = strings.Replace(cap.Text[i], strings.Title(bad), strings.Title(good), -1)
3334
}
3435

36+
cap.Text[i] = fixOCRLineCapitalization(cap.Text[i])
3537
if org != cap.Text[i] {
3638
log.Println("[ocr]", org, "->", cap.Text[i])
3739
}
3840
}
3941
}
4042
return subtitle
4143
}
44+
45+
func fixOCRLineCapitalization(s string) string {
46+
words := strings.Split(s, " ")
47+
for i := range words {
48+
words[i] = fixOCRWordCapitalization(words[i])
49+
}
50+
return strings.Join(words, " ")
51+
}
52+
53+
// fixOCRWordCapitalization repairs capitalization errors introduced by OCR
// in a single word, e.g. "GAsPs" => "GASPS".
//
// Rules, applied in order:
//   - words of at most 3 characters (runes, not bytes) are returned unchanged;
//   - words with at least 2 upper case letters are converted to all upper case;
//   - words whose only upper case letter is their first letter are returned
//     unchanged (leading punctuation such as a quote or parenthesis is
//     ignored when locating the first letter);
//   - anything else is converted to all lower case.
//
// NOTE: the first *letter* decides the "starts with upper case" rule, so a
// word wrapped in markup whose tag name is lower case (e.g. "<i>Hello</i>")
// still falls through to the lower case rule.
func fixOCRWordCapitalization(s string) string {
	runeCount := 0
	upper := 0
	seenLetter := false
	firstLetterUpper := false
	for _, r := range s {
		runeCount++
		if !unicode.IsLetter(r) {
			continue
		}
		if unicode.IsUpper(r) {
			upper++
			if !seenLetter {
				firstLetterUpper = true
			}
		}
		seenLetter = true
	}

	switch {
	case runeCount <= 3:
		// too short to judge reliably
		return s
	case upper >= 2:
		return strings.ToUpper(s)
	case firstLetterUpper:
		// Exactly one upper case letter and it is the leading one, so the
		// word is already correctly capitalized. strings.Title is avoided on
		// purpose: it treats punctuation as a word boundary and would turn
		// "Don't" into "Don'T".
		return s
	default:
		return strings.ToLower(s)
	}
}

filter_ocr_test.go

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,3 +81,22 @@ func TestFilterOCREnglish(t *testing.T) {
8181

8282
assert.Equal(t, &expected, in.filterOCR())
8383
}
84+
85+
func TestFilterOCRCapitalization(t *testing.T) {
86+
87+
in := Subtitle{[]Caption{{
88+
1,
89+
makeTime(0, 0, 4, 630),
90+
makeTime(0, 0, 6, 18),
91+
[]string{"GAsPs slowly"},
92+
}}}
93+
94+
expected := Subtitle{[]Caption{{
95+
1,
96+
makeTime(0, 0, 4, 630),
97+
makeTime(0, 0, 6, 18),
98+
[]string{"GASPS slowly"},
99+
}}}
100+
101+
assert.Equal(t, &expected, in.filterOCR())
102+
}

0 commit comments

Comments
 (0)