Skip to content

Commit efd1bc4

Browse files
committed
ocr: improve fixOCRWordCapitalization()
1 parent f7b4e9e commit efd1bc4

File tree

2 files changed

+33
-13
lines changed

2 files changed

+33
-13
lines changed

filter_ocr.go

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -62,10 +62,14 @@ func fixOCRWordCapitalization(s string) string {
6262

6363
// don't touch a mix of lowercase + uppercase letter groups such as in "macOS"
6464
cases := countCaseInLetters(s)
65-
if len(cases) < 4 {
65+
caseLen := len(cases)
66+
if caseLen >= 2 && cases[0].kind == upper && cases[1].kind == lower {
67+
// don't count a capitalized word such as "Word" (one uppercase group followed by a lowercase group) as two groups of casings
68+
caseLen--
69+
}
70+
if caseLen <= 2 {
6671
return s
6772
}
68-
6973
if countUppercaseLetters(s) >= 2 {
7074
return strings.ToUpper(s)
7175
}
@@ -138,6 +142,7 @@ func getCase(c rune) caseType {
138142
return none
139143
}
140144

145+
// countCaseInLetters returns a slice describing, in order, each run of same-case letters in the input string and its length (see test cases)
141146
func countCaseInLetters(s string) []caseCount {
142147
res := []caseCount{}
143148
currentCount := 0

filter_ocr_test.go

Lines changed: 26 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -87,21 +87,36 @@ func TestFilterOCRCapitalization(t *testing.T) {
8787
}
8888

8989
func TestFixOCRWordCapitalization(t *testing.T) {
90-
assert.Equal(t, "He's", fixOCRWordCapitalization("He's"))
91-
assert.Equal(t, "GASPS", fixOCRWordCapitalization("GAsPs"))
92-
93-
assert.Equal(t, "macOS", fixOCRWordCapitalization("macOS"))
94-
assert.Equal(t, "WindowsXP", fixOCRWordCapitalization("WindowsXP"))
90+
input := map[string]string{
91+
"GAsPs": "GASPS",
92+
"He's": "He's",
93+
"macOS": "macOS",
94+
"WindowsXP": "WindowsXP",
95+
}
96+
for in, out := range input {
97+
assert.Equal(t, out, fixOCRWordCapitalization(in))
98+
}
9599
}
96100

97101
func TestStartsWithUppercase(t *testing.T) {
98-
assert.Equal(t, true, startsWithUppercase("Allo"))
99-
assert.Equal(t, true, startsWithUppercase("Ällo"))
100-
assert.Equal(t, false, startsWithUppercase("allo"))
102+
input := map[string]bool{
103+
"Allo": true,
104+
"Ällo": true,
105+
"allo": false,
106+
}
107+
for in, out := range input {
108+
assert.Equal(t, out, startsWithUppercase(in))
109+
}
101110
}
102111

103112
func TestCountCaseInLetters(t *testing.T) {
104-
assert.Equal(t, []caseCount{{upper, 2}}, countCaseInLetters("GA"))
105-
assert.Equal(t, []caseCount{{lower, 2}}, countCaseInLetters("ga"))
106-
assert.Equal(t, []caseCount{{upper, 2}, {lower, 1}, {upper, 1}, {lower, 1}}, countCaseInLetters("GAsPs"))
113+
input := map[string][]caseCount{
114+
"HELLO": []caseCount{{upper, 5}},
115+
"hello": []caseCount{{lower, 5}},
116+
"Hello": []caseCount{{upper, 1}, {lower, 4}},
117+
"GAsPs": []caseCount{{upper, 2}, {lower, 1}, {upper, 1}, {lower, 1}},
118+
}
119+
for in, out := range input {
120+
assert.Equal(t, out, countCaseInLetters(in))
121+
}
107122
}

0 commit comments

Comments
 (0)