@@ -3,6 +3,7 @@ package subtitles
3
3
import (
4
4
"strings"
5
5
"unicode"
6
+ "unicode/utf8"
6
7
7
8
log "github.com/Sirupsen/logrus"
8
9
)
@@ -14,29 +15,32 @@ var (
14
15
"0n " : "on " ,
15
16
"c0uld" : "could" ,
16
17
"s0mething" : "something" ,
17
- "l've" : "i've" ,
18
+ "l've" : "I've" ,
19
+ "1 Oth" : "10th" ,
18
20
}
19
21
)
20
22
21
23
// filterOCR corrects some OCR mistakes
22
24
func (subtitle * Subtitle ) filterOCR () * Subtitle {
23
25
for _ , cap := range subtitle .Captions {
24
26
for i , org := range cap .Text {
27
+ s := cap .Text [i ]
25
28
for bad , good := range ocrErrors {
26
29
// lower case
27
- cap . Text [ i ] = strings .Replace (cap . Text [ i ] , bad , good , - 1 )
30
+ s = strings .Replace (s , bad , good , - 1 )
28
31
29
32
// upper case
30
- cap . Text [ i ] = strings .Replace (cap . Text [ i ] , strings .ToUpper (bad ), strings .ToUpper (good ), - 1 )
33
+ s = strings .Replace (s , strings .ToUpper (bad ), strings .ToUpper (good ), - 1 )
31
34
32
35
// ucfirst
33
- cap . Text [ i ] = strings .Replace (cap . Text [ i ] , strings .Title (bad ), strings .Title (good ), - 1 )
36
+ s = strings .Replace (s , strings .Title (bad ), strings .Title (good ), - 1 )
34
37
}
35
38
36
- cap . Text [ i ] = fixOCRLineCapitalization (cap . Text [ i ] )
37
- if org != cap . Text [ i ] {
38
- log .Println ("[ocr]" , org , "->" , cap . Text [ i ] )
39
+ s = fixOCRLineCapitalization (s )
40
+ if org != s {
41
+ log .Println ("[ocr]" , org , "->" , s )
39
42
}
43
+ cap .Text [i ] = s
40
44
}
41
45
}
42
46
return subtitle
@@ -52,26 +56,109 @@ func fixOCRLineCapitalization(s string) string {
52
56
53
57
// fix capitalization errors due to ocr, GAsPs => GASPS
54
58
func fixOCRWordCapitalization (s string ) string {
55
- if len (s ) <= 3 {
59
+ if len (s ) <= 3 || ! isASCIIOnly ( s ) {
56
60
return s
57
61
}
58
62
59
- // if starts with uc, or at least 2 letters is upper, make all upper
60
- upper := 0
61
- ucStart := false
62
- for i , char := range s {
63
- if i == 0 && unicode .IsUpper (char ) {
64
- ucStart = true
65
- }
66
- if unicode .IsUpper (char ) {
67
- upper ++
68
- }
63
+ // don't touch group of lowercase + uppercase such as in "macOS"
64
+ cases := countCaseInLetters (s )
65
+ if len (cases ) < 4 {
66
+ return s
69
67
}
70
- if upper >= 2 {
68
+
69
+ if countUppercaseLetters (s ) >= 2 {
71
70
return strings .ToUpper (s )
72
71
}
73
- if ucStart {
72
+ if startsWithUppercase ( s ) {
74
73
return strings .Title (s )
75
74
}
76
75
return strings .ToLower (s )
77
76
}
77
+
78
// countUppercaseLetters returns how many runes in s are uppercase letters
// according to unicode.IsUpper.
func countUppercaseLetters(s string) int {
	total := 0
	for _, r := range s {
		if !unicode.IsUpper(r) {
			continue
		}
		total++
	}
	return total
}
87
+
88
// countLowercaseLetters returns how many runes in s are lowercase letters
// according to unicode.IsLower.
func countLowercaseLetters(s string) int {
	total := 0
	for _, r := range s {
		if !unicode.IsLower(r) {
			continue
		}
		total++
	}
	return total
}
97
+
98
// startsWithUppercase reports whether the first rune of s is an uppercase
// letter. It returns false for an empty string and for a string that starts
// with invalid UTF-8.
func startsWithUppercase(s string) bool {
	// For empty or invalid input DecodeRuneInString yields utf8.RuneError,
	// which is not an uppercase letter, so no separate guard is required.
	first, _ := utf8.DecodeRuneInString(s)
	return unicode.IsUpper(first)
}
108
+
109
// isASCIIOnly reports whether s consists solely of ASCII letters (a-z, A-Z).
// Despite its name it is stricter than "ASCII": digits, spaces, punctuation
// and any non-ASCII byte all make it return false. The empty string yields
// true.
func isASCIIOnly(s string) bool {
	for i := 0; i < len(s); i++ {
		b := s[i]
		isLowerLetter := 'a' <= b && b <= 'z'
		isUpperLetter := 'A' <= b && b <= 'Z'
		if !isLowerLetter && !isUpperLetter {
			return false
		}
	}
	return true
}
117
+
118
+ type caseCount struct {
119
+ kind caseType
120
+ n int
121
+ }
122
+
123
// caseType classifies a rune by letter case.
type caseType int

// Possible caseType values.
const (
	// none is the zero value, used for runes that are neither uppercase nor
	// lowercase.
	none caseType = iota
	// lower marks a lowercase letter.
	lower
	// upper marks an uppercase letter.
	upper
)
130
+
131
+ func getCase (c rune ) caseType {
132
+ if unicode .IsUpper (c ) {
133
+ return upper
134
+ }
135
+ if unicode .IsLower (c ) {
136
+ return lower
137
+ }
138
+ return none
139
+ }
140
+
141
+ func countCaseInLetters (s string ) []caseCount {
142
+ res := []caseCount {}
143
+ currentCount := 0
144
+ lastCase := none
145
+ for _ , c := range s {
146
+ currentCase := getCase (c )
147
+ if lastCase == none {
148
+ lastCase = currentCase
149
+ }
150
+ if lastCase != currentCase {
151
+ if currentCount > 0 {
152
+ res = append (res , caseCount {lastCase , currentCount })
153
+ currentCount = 1
154
+ lastCase = currentCase
155
+ }
156
+ } else {
157
+ currentCount ++
158
+ }
159
+ }
160
+ if currentCount > 0 {
161
+ res = append (res , caseCount {lastCase , currentCount })
162
+ }
163
+ return res
164
+ }
0 commit comments