Skip to content

Commit 724b4b5

Browse files
committed
ccdb: handle obscure text-based format
1 parent 497af97 commit 724b4b5

14 files changed

+164
-61
lines changed

Makefile

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
install:
2+
go install -v ./...

ccdb.go

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
package subtitles
2+
3+
// An obscure (to me) text-based subtitle format; files can have the extension .cc or .txt.
4+
// found in The Way We Live Now (2001) BBC TV 1.txt
5+
// STATUS: incomplete detection and support
6+
7+
import (
8+
"log"
9+
"strings"
10+
"time"
11+
)
12+
13+
// looksLikeCCDBCapture reports whether s contains the "[SUBTITLE]" marker
// anywhere in its contents. The match is case-sensitive.
func looksLikeCCDBCapture(s string) bool {
	const marker = "[SUBTITLE]"
	return strings.Index(s, marker) >= 0
}
16+
17+
// NewFromCCDBCapture parses a ccdb capture text into []Caption, assumes s is a clean utf8 string
18+
func NewFromCCDBCapture(s string) (res Subtitle, err error) {
19+
rows := strings.Split(s, "\n")
20+
seq := 1
21+
caption := Caption{Seq: seq}
22+
parseText := false
23+
for rowNum, row := range rows {
24+
if len(row) > 1 && row[0] == '[' {
25+
continue
26+
}
27+
if parseText {
28+
// log.Println("TEXT:", row)
29+
if row == "\r" || row == "" {
30+
parseText = false // XXX until text = empty
31+
} else {
32+
row = strings.TrimSpace(row)
33+
if row != "" {
34+
caption.Text = append(caption.Text, row)
35+
}
36+
}
37+
if strings.Join(caption.Text, "") != "" {
38+
res.Captions = append(res.Captions, caption)
39+
seq++
40+
caption = Caption{Seq: seq}
41+
}
42+
} else if !parseText {
43+
if row == "" {
44+
if rowNum != len(rows)-1 {
45+
log.Println("NOTICE: ccdb seem to have reached end of valid stream at row", rowNum, "of", len(rows))
46+
}
47+
break
48+
}
49+
// log.Println("TIME:", row)
50+
parts := strings.SplitN(row, ",", 2)
51+
if len(parts) == 2 {
52+
caption.Start, _ = parseCCDBTime(parts[0])
53+
caption.End, _ = parseCCDBTime(parts[1])
54+
} else {
55+
log.Println("TIME seq", seq, ", input row", (rowNum + 1), "error:", row)
56+
}
57+
parseText = true
58+
}
59+
}
60+
return
61+
}
62+
63+
func parseCCDBTime(s string) (time.Time, error) {
64+
return parseTime(s)
65+
}

ccdb_test.go

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
package subtitles
2+
3+
import (
4+
"testing"
5+
6+
"github.com/stretchr/testify/assert"
7+
)
8+
9+
func TestNewFromCCDBCapture(t *testing.T) {
10+
11+
in := "[SUBTITLE]\r\n" +
12+
"[COLF]&HFFFFFF,[STYLE]no,[SIZE]10,[FONT]Arial\r\n" +
13+
"00:00:16.24,00:01:25.82\r\n" +
14+
"Whoa. \r\n" +
15+
"\r\n" +
16+
"00:01:31.45,00:01:33.62\r\n" +
17+
"Go on. Get out. \r\n" +
18+
"\r\n" +
19+
"00:01:33.62,00:01:33.65\r\n" +
20+
" \r\n" + // should disappear in the parsed captions
21+
"\r\n" +
22+
"00:01:33.65,00:01:34.81\r\n" +
23+
"Out! \r\n" +
24+
"\r\n"
25+
26+
expected := Subtitle{[]Caption{{
27+
1,
28+
makeTime(0, 0, 16, 24),
29+
makeTime(0, 1, 25, 82),
30+
[]string{"Whoa."},
31+
}, Caption{
32+
2,
33+
makeTime(0, 1, 31, 45),
34+
makeTime(0, 1, 33, 62),
35+
[]string{"Go on. Get out."},
36+
}, Caption{
37+
3,
38+
makeTime(0, 1, 33, 65),
39+
makeTime(0, 1, 34, 81),
40+
[]string{"Out!"},
41+
}}}
42+
43+
res, err := NewFromCCDBCapture(in)
44+
assert.Equal(t, nil, err)
45+
assert.Equal(t, expected, res)
46+
}
47+
48+
func TestParseCCDBTime(t *testing.T) {
49+
t1, _ := parseCCDBTime("00:00:16.24")
50+
assert.Equal(t, makeTime(0, 0, 16, 24), t1)
51+
}

cleaner_test.go

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@ import (
77
)
88

99
func TestRemoveAds(t *testing.T) {
10-
1110
in := Subtitle{[]Caption{{
1211
1,
1312
makeTime(0, 0, 4, 630),
@@ -36,6 +35,5 @@ func TestRemoveAds(t *testing.T) {
3635
makeTime(0, 1, 11, 005),
3736
[]string{"No ninja!"},
3837
}}}
39-
4038
assert.Equal(t, &expected, in.RemoveAds())
4139
}

cmd/subber/subber.go

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,7 @@ func cleanupSub(data []byte, filterName string, keepAds bool, sync int) (string,
9393
func action(inFileName string) error {
9494

9595
ext := path.Ext(inFileName)
96-
if ext == ".srt" {
96+
if subtitles.LooksLikeTextSubtitle(inFileName) {
9797
if !*dontTouch {
9898
parseAndWriteSubFile(inFileName, *filterName, *keepAds, *sync)
9999
}
@@ -103,7 +103,6 @@ func action(inFileName string) error {
103103
subFileName := inFileName[:len(inFileName)-len(ext)] + ".srt"
104104

105105
if fileExists(subFileName) {
106-
107106
verboseMessage("Subs found locally in", subFileName, ", skipping download")
108107

109108
if !*dontTouch {

encoding_test.go

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,13 +8,11 @@ import (
88
)
99

1010
func TestLooksLikeLatin1(t *testing.T) {
11-
1211
assert.Equal(t, true, looksLikeLatin1([]byte("hall\xe5")))
1312
assert.Equal(t, false, looksLikeLatin1([]byte("hallå")))
1413
}
1514

1615
func TestReadFileAsUTF8(t *testing.T) {
17-
1816
f, err := os.Open("README.md")
1917
assert.Equal(t, nil, err)
2018

filter_caps_test.go

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,20 +7,17 @@ import (
77
)
88

99
func TestFilterCapitalization(t *testing.T) {
10-
1110
in := Subtitle{Captions: []Caption{{
1211
Seq: 1,
1312
Start: makeTime(0, 0, 4, 630),
1413
End: makeTime(0, 0, 6, 18),
1514
Text: []string{"GO NINJA!", "NINJA GO!"},
1615
}}}
17-
1816
expected := Subtitle{[]Caption{{
1917
1,
2018
makeTime(0, 0, 4, 630),
2119
makeTime(0, 0, 6, 18),
2220
[]string{"Go ninja!", "Ninja go!"},
2321
}}}
24-
2522
assert.Equal(t, &expected, in.filterCapitalization())
2623
}

filter_html_test.go

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,20 +7,17 @@ import (
77
)
88

99
func TestFilterHTML(t *testing.T) {
10-
1110
in := Subtitle{[]Caption{{
1211
1,
1312
makeTime(0, 0, 4, 630),
1413
makeTime(0, 0, 6, 18),
1514
[]string{"<b>GO NINJA!</b>", "NINJA&nbsp;GO!"},
1615
}}}
17-
1816
expected := Subtitle{[]Caption{{
1917
1,
2018
makeTime(0, 0, 4, 630),
2119
makeTime(0, 0, 6, 18),
2220
[]string{"GO NINJA!", "NINJA GO!"},
2321
}}}
24-
2522
assert.Equal(t, &expected, in.filterHTML())
2623
}

filter_ocr_test.go

Lines changed: 0 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -7,96 +7,81 @@ import (
77
)
88

99
func TestFilterOCRLower(t *testing.T) {
10-
1110
in := Subtitle{[]Caption{{
1211
1,
1312
makeTime(0, 0, 4, 630),
1413
makeTime(0, 0, 6, 18),
1514
[]string{"s0mething good"},
1615
}}}
17-
1816
expected := Subtitle{[]Caption{{
1917
1,
2018
makeTime(0, 0, 4, 630),
2119
makeTime(0, 0, 6, 18),
2220
[]string{"something good"},
2321
}}}
24-
2522
assert.Equal(t, &expected, in.filterOCR())
2623
}
2724

2825
func TestFilterOCRUpper(t *testing.T) {
29-
3026
in := Subtitle{[]Caption{{
3127
1,
3228
makeTime(0, 0, 4, 630),
3329
makeTime(0, 0, 6, 18),
3430
[]string{"S0METHING GOOD"},
3531
}}}
36-
3732
expected := Subtitle{[]Caption{{
3833
1,
3934
makeTime(0, 0, 4, 630),
4035
makeTime(0, 0, 6, 18),
4136
[]string{"SOMETHING GOOD"},
4237
}}}
43-
4438
assert.Equal(t, &expected, in.filterOCR())
4539
}
4640

4741
func TestFilterOCRUcFirst(t *testing.T) {
48-
4942
in := Subtitle{[]Caption{{
5043
1,
5144
makeTime(0, 0, 4, 630),
5245
makeTime(0, 0, 6, 18),
5346
[]string{"S0mething good"},
5447
}}}
55-
5648
expected := Subtitle{[]Caption{{
5749
1,
5850
makeTime(0, 0, 4, 630),
5951
makeTime(0, 0, 6, 18),
6052
[]string{"Something good"},
6153
}}}
62-
6354
assert.Equal(t, &expected, in.filterOCR())
6455
}
6556

6657
func TestFilterOCREnglish(t *testing.T) {
67-
6858
in := Subtitle{[]Caption{{
6959
1,
7060
makeTime(0, 0, 4, 630),
7161
makeTime(0, 0, 6, 18),
7262
[]string{"l've got a feeling"},
7363
}}}
74-
7564
expected := Subtitle{[]Caption{{
7665
1,
7766
makeTime(0, 0, 4, 630),
7867
makeTime(0, 0, 6, 18),
7968
[]string{"i've got a feeling"},
8069
}}}
81-
8270
assert.Equal(t, &expected, in.filterOCR())
8371
}
8472

8573
func TestFilterOCRCapitalization(t *testing.T) {
86-
8774
in := Subtitle{[]Caption{{
8875
1,
8976
makeTime(0, 0, 4, 630),
9077
makeTime(0, 0, 6, 18),
9178
[]string{"GAsPs slowly"},
9279
}}}
93-
9480
expected := Subtitle{[]Caption{{
9581
1,
9682
makeTime(0, 0, 4, 630),
9783
makeTime(0, 0, 6, 18),
9884
[]string{"GASPS slowly"},
9985
}}}
100-
10186
assert.Equal(t, &expected, in.filterOCR())
10287
}

finder_thesubdb_test.go

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@ import (
1111
)
1212

1313
func TestDownloadFromTheSubDb(t *testing.T) {
14-
1514
fileName := createZeroedTempFile(1024 * 1024 * 4)
1615
defer os.Remove(fileName)
1716

@@ -44,7 +43,6 @@ func subDbConformTest(t *testing.T, fileName string, expectedHash string) {
4443
}
4544

4645
func TestSubDbHashFromFile(t *testing.T) {
47-
4846
// NOTE for this to work, run "./hash-conformance-deps" to fetch needed files
4947

5048
// http://thesubdb.com/api/samples/dexter.mp4

0 commit comments

Comments
 (0)