Skip to content

Commit c647436

Browse files
committed
filter: improve ocr filter
1 parent a3379b9 commit c647436

File tree

3 files changed

+130
-23
lines changed

3 files changed

+130
-23
lines changed

cleaner.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ var (
4444
"seriessub",
4545
"addic7ed", "addicted.com", "vaioholics",
4646
"sdimedia", "sdi media",
47-
"allsubs.org", "hdbits.org", "bierdopje.com", "subcentral",
47+
"allsubs.org", "hdbits.org", "bierdopje.com", "subcentral", "mkvcage",
4848
"cssubs", "tvsub", "uksubtitles",
4949
"ragbear.com", "ydy.com", "yyets.net", "indivx.net", "sub-way.fr", "blogspot",
5050
"forom.com", "forom. com", "facebook.com", "hdvietnam.com", "sapo.pt", "softhome.net",
@@ -57,7 +57,7 @@ var (
5757
"thepiratebay", "anoxmous", "verdikt", "la fisher team", "red bee media",
5858
"mkv player", "best watched using", "advertise your product", "remove all ads",
5959
"memoryonsmells", "1st-booking",
60-
":[gwc]:", "ripped with subrip", "titra film",
60+
":[gwc]:", "ripped by", "ripped with subrip", "titra film",
6161

6262
// swedish:
6363
"swedish subtitles", "svenska undertexter", "internationella undertexter",

filter_ocr.go

Lines changed: 107 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ package subtitles
33
import (
44
"strings"
55
"unicode"
6+
"unicode/utf8"
67

78
log "github.com/Sirupsen/logrus"
89
)
@@ -14,29 +15,32 @@ var (
1415
"0n ": "on ",
1516
"c0uld": "could",
1617
"s0mething": "something",
17-
"l've": "i've",
18+
"l've": "I've",
19+
"1 Oth": "10th",
1820
}
1921
)
2022

2123
// filterOCR corrects some OCR mistakes
2224
func (subtitle *Subtitle) filterOCR() *Subtitle {
2325
for _, cap := range subtitle.Captions {
2426
for i, org := range cap.Text {
27+
s := cap.Text[i]
2528
for bad, good := range ocrErrors {
2629
// lower case
27-
cap.Text[i] = strings.Replace(cap.Text[i], bad, good, -1)
30+
s = strings.Replace(s, bad, good, -1)
2831

2932
// upper case
30-
cap.Text[i] = strings.Replace(cap.Text[i], strings.ToUpper(bad), strings.ToUpper(good), -1)
33+
s = strings.Replace(s, strings.ToUpper(bad), strings.ToUpper(good), -1)
3134

3235
// ucfirst
33-
cap.Text[i] = strings.Replace(cap.Text[i], strings.Title(bad), strings.Title(good), -1)
36+
s = strings.Replace(s, strings.Title(bad), strings.Title(good), -1)
3437
}
3538

36-
cap.Text[i] = fixOCRLineCapitalization(cap.Text[i])
37-
if org != cap.Text[i] {
38-
log.Println("[ocr]", org, "->", cap.Text[i])
39+
s = fixOCRLineCapitalization(s)
40+
if org != s {
41+
log.Println("[ocr]", org, "->", s)
3942
}
43+
cap.Text[i] = s
4044
}
4145
}
4246
return subtitle
@@ -52,26 +56,109 @@ func fixOCRLineCapitalization(s string) string {
5256

5357
// fix capitalization errors due to ocr, GAsPs => GASPS
5458
func fixOCRWordCapitalization(s string) string {
55-
if len(s) <= 3 {
59+
if len(s) <= 3 || !isASCIIOnly(s) {
5660
return s
5761
}
5862

59-
// if starts with uc, or at least 2 letters is upper, make all upper
60-
upper := 0
61-
ucStart := false
62-
for i, char := range s {
63-
if i == 0 && unicode.IsUpper(char) {
64-
ucStart = true
65-
}
66-
if unicode.IsUpper(char) {
67-
upper++
68-
}
63+
// don't touch group of lowercase + uppercase such as in "macOS"
64+
cases := countCaseInLetters(s)
65+
if len(cases) < 4 {
66+
return s
6967
}
70-
if upper >= 2 {
68+
69+
if countUppercaseLetters(s) >= 2 {
7170
return strings.ToUpper(s)
7271
}
73-
if ucStart {
72+
if startsWithUppercase(s) {
7473
return strings.Title(s)
7574
}
7675
return strings.ToLower(s)
7776
}
77+
78+
// countUppercaseLetters returns how many runes in s are reported as
// uppercase by unicode.IsUpper.
func countUppercaseLetters(s string) int {
	n := 0
	for _, r := range s {
		if !unicode.IsUpper(r) {
			continue
		}
		n++
	}
	return n
}
87+
88+
// countLowercaseLetters returns how many runes in s are reported as
// lowercase by unicode.IsLower.
func countLowercaseLetters(s string) int {
	n := 0
	for _, r := range s {
		if !unicode.IsLower(r) {
			continue
		}
		n++
	}
	return n
}
97+
98+
// startsWithUppercase reports whether the first rune of s is an uppercase
// letter. It returns false for an empty string and for a string whose
// leading bytes decode to utf8.RuneError.
func startsWithUppercase(s string) bool {
	first, width := utf8.DecodeRuneInString(s)
	return first != utf8.RuneError && width > 0 && unicode.IsUpper(first)
}
108+
109+
// isASCIIOnly reports whether s consists solely of the ASCII letters a-z and
// A-Z. Any other rune, including ASCII digits, spaces and punctuation, makes
// it return false. An empty string yields true.
func isASCIIOnly(s string) bool {
	for _, r := range s {
		isLower := 'a' <= r && r <= 'z'
		isUpper := 'A' <= r && r <= 'Z'
		if !isLower && !isUpper {
			return false
		}
	}
	return true
}
117+
118+
// caseType classifies a rune by letter case.
type caseType int

// The caseType values. none is the zero value; getCase returns it for runes
// that are neither uppercase nor lowercase.
const (
	none caseType = iota
	lower
	upper
)

// caseCount pairs a caseType with a count of runes; countCaseInLetters
// returns one per run of same-case runes.
type caseCount struct {
	kind caseType
	n    int
}
130+
131+
func getCase(c rune) caseType {
132+
if unicode.IsUpper(c) {
133+
return upper
134+
}
135+
if unicode.IsLower(c) {
136+
return lower
137+
}
138+
return none
139+
}
140+
141+
func countCaseInLetters(s string) []caseCount {
142+
res := []caseCount{}
143+
currentCount := 0
144+
lastCase := none
145+
for _, c := range s {
146+
currentCase := getCase(c)
147+
if lastCase == none {
148+
lastCase = currentCase
149+
}
150+
if lastCase != currentCase {
151+
if currentCount > 0 {
152+
res = append(res, caseCount{lastCase, currentCount})
153+
currentCount = 1
154+
lastCase = currentCase
155+
}
156+
} else {
157+
currentCount++
158+
}
159+
}
160+
if currentCount > 0 {
161+
res = append(res, caseCount{lastCase, currentCount})
162+
}
163+
return res
164+
}

filter_ocr_test.go

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ func TestFilterOCREnglish(t *testing.T) {
6565
1,
6666
makeTime(0, 0, 4, 630),
6767
makeTime(0, 0, 6, 18),
68-
[]string{"i've got a feeling"},
68+
[]string{"I've got a feeling"},
6969
}}}
7070
assert.Equal(t, &expected, in.filterOCR())
7171
}
@@ -85,3 +85,23 @@ func TestFilterOCRCapitalization(t *testing.T) {
8585
}}}
8686
assert.Equal(t, &expected, in.filterOCR())
8787
}
88+
89+
func TestFixOCRWordCapitalization(t *testing.T) {
90+
assert.Equal(t, "He's", fixOCRWordCapitalization("He's"))
91+
assert.Equal(t, "GASPS", fixOCRWordCapitalization("GAsPs"))
92+
93+
assert.Equal(t, "macOS", fixOCRWordCapitalization("macOS"))
94+
assert.Equal(t, "WindowsXP", fixOCRWordCapitalization("WindowsXP"))
95+
}
96+
97+
func TestStartsWithUppercase(t *testing.T) {
98+
assert.Equal(t, true, startsWithUppercase("Allo"))
99+
assert.Equal(t, true, startsWithUppercase("Ällo"))
100+
assert.Equal(t, false, startsWithUppercase("allo"))
101+
}
102+
103+
func TestCountCaseInLetters(t *testing.T) {
104+
assert.Equal(t, []caseCount{{upper, 2}}, countCaseInLetters("GA"))
105+
assert.Equal(t, []caseCount{{lower, 2}}, countCaseInLetters("ga"))
106+
assert.Equal(t, []caseCount{{upper, 2}, {lower, 1}, {upper, 1}, {lower, 1}}, countCaseInLetters("GAsPs"))
107+
}

0 commit comments

Comments
 (0)