-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathbuild_corpus.py
221 lines (182 loc) · 6.79 KB
/
build_corpus.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
"""
NAME
===============================
Build Corpus (build_corpus.py)
BY
===============================
Matthew Blessing
LICENCE:
===============================
Code = MIT. See [README](https://github.com/MarkGotham/Hauptstimme/tree/main#licence)
ABOUT:
===============================
Build the OpenScore Orchestra corpus from the .mscz files.
For each file:
- Convert to .mxl
- Get compressed measure map
- Get Hauptstimme annotations file and 'melody score'
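- Get lightweight score .csv
- Get part relationship summary .csv
- Get score-audio alignment tables (for scores that have recordings
listed in 'audios.tsv')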
Then, create the rest of the metadata files from the finished
'sets.tsv' and 'composers.tsv' files.
"""
from __future__ import annotations
import os
from pathlib import Path
from hauptstimme.score_conversion import score_to_lightweight_df
from hauptstimme.metadata import *
from hauptstimme.utils import (
ms3_convert, get_corpus_files, get_compressed_measure_map_given_measures
)
from hauptstimme.annotations import get_annotations_and_melody_scores
from hauptstimme.part_relations import get_part_relationship_summary
from hauptstimme.alignment.score_audio_alignment import align_score_audios
from hauptstimme.constants import CORPUS_PATH
from typing import cast, List
def get_corpus_measure_maps():
    """
    Get compressed measure maps for all scores in the corpus.

    The measures tables for every .mscz file are extracted with the
    `ms3 extract` command into a '.temp' directory inside the current
    working directory. Each table is renamed to drop '.measures' from
    its name and is then passed, together with its score, to
    `get_compressed_measure_map_given_measures`. The '.temp' directory
    is always removed afterwards, even if one of the steps fails.

    The exit status of `ms3` is not checked.

    Raises:
        FileNotFoundError: If the `ms3` executable cannot be found.
    """
    # Imported locally so this function does not depend on additional
    # module-level imports
    import shutil
    import subprocess

    temp_dir = ".temp"
    os.makedirs(temp_dir, exist_ok=True)
    try:
        # Get measures info for all scores.
        # The command is given as an argument list (no shell), so paths
        # containing quotes or spaces and the backslash in the regex
        # reach ms3 unchanged
        subprocess.run(
            [
                "ms3", "extract",
                "-d", str(CORPUS_PATH),
                "-a",
                "-i", r".*\.mscz",
                "-M", f"{os.getcwd()}/.temp",
                "-l", "c",
            ],
            check=False
        )

        # Remove '.measures' from all filenames
        for filename in os.listdir(temp_dir):
            new_filename = filename.replace(".measures", "")
            if new_filename == filename:
                # Nothing to rename
                continue
            os.rename(
                os.path.join(temp_dir, filename),
                os.path.join(temp_dir, new_filename)
            )

        mscz_files = get_corpus_files(file_path="*.mscz", pathlib=True)
        mscz_files = cast(List[Path], mscz_files)

        for mscz_file in mscz_files:
            # After renaming, the measures table shares the score's
            # file name with a '.tsv' extension
            measures_file = f".temp/{mscz_file.with_suffix('.tsv').name}"
            get_compressed_measure_map_given_measures(
                mscz_file, measures_file, verbose=False
            )
    finally:
        # Portable equivalent of 'rm -rf .temp'
        shutil.rmtree(temp_dir, ignore_errors=True)
def get_corpus_annotations_and_melody_scores():
    """
    Get an annotation file and melody score for all scores in the
    corpus.

    `get_annotations_and_melody_scores` is called once per entry in an
    ordered job list. Each entry is either a whole composer directory
    or a single work, paired with the keyword arguments for that call.
    Works that need `lyrics_not_text=False` are listed after their
    composer's directory-wide entry.
    """
    # Annotation patterns that are used by more than one job
    beethoven_pattern = "([a-zA-Z]('+|!)?)|tr.?"
    letter_prime_pattern = "[a-zA-Z]'?"

    bach_dir = CORPUS_PATH / "Bach,_Johann_Sebastian"
    beach_dir = CORPUS_PATH / "Beach,_Amy"
    beethoven_dir = CORPUS_PATH / "Beethoven,_Ludwig_van"
    brahms_dir = CORPUS_PATH / "Brahms,_Johannes"
    bruckner_dir = CORPUS_PATH / "Bruckner,_Anton"

    # (target, keyword arguments) pairs, processed in this order
    jobs = [
        (
            f"{bach_dir}/B_Minor_Mass,_BWV.232",
            {
                "lyrics_not_text": False,
                "annotation_restrictions": "[a-zA-Z]",
            },
        ),
        (f"{bach_dir}/Brandenburg_Concerto_No.3,_BWV.1048", {}),
        (f"{bach_dir}/Brandenburg_Concerto_No.4,_BWV.1049", {}),
        (
            beach_dir,
            {
                "annotation_restrictions": (
                    r"([a-zA-Z]([a-zA-Z]|('+|!))?)|([a-zA-Z]\+[a-zA-Z])"
                    r"|cad\.|trans"
                ),
            },
        ),
        (beethoven_dir, {"annotation_restrictions": beethoven_pattern}),
        (
            f"{beethoven_dir}/Symphony_No.9,_Op.125/4",
            {
                "lyrics_not_text": False,
                "annotation_restrictions": beethoven_pattern,
            },
        ),
        (brahms_dir, {"annotation_restrictions": letter_prime_pattern}),
        (
            f"{brahms_dir}/Ein_Deutsches_Requiem,_Op.45",
            {
                "lyrics_not_text": False,
                "annotation_restrictions": letter_prime_pattern,
            },
        ),
        (bruckner_dir, {"annotation_restrictions": letter_prime_pattern}),
    ]

    for target, options in jobs:
        get_annotations_and_melody_scores(target, **options)
def get_corpus_lightweight_scores():
    """
    Get a lightweight score file for every score in the corpus.

    Every .mxl file in the corpus is processed, except files whose
    name ends in '_melody.mxl'. Each remaining score is passed to
    `score_to_lightweight_df` along with the path of its '.mm.json'
    file.
    """
    for score_file in get_corpus_files(file_path="*.mxl", pathlib=True):
        score_file = cast(Path, score_file)
        # Skip the '_melody.mxl' files
        if score_file.as_posix().endswith("_melody.mxl"):
            continue
        measure_map_file = score_file.with_suffix(".mm.json")
        score_to_lightweight_df(score_file, measure_map_file)
def get_corpus_part_relations():
    """
    Get a part relationships summary for every score in the corpus.

    For each .mscz file, the sibling '.mxl' and '.csv' files and the
    '<name>_annotations.csv' file are passed to
    `get_part_relationship_summary`. The result is written next to the
    score as '<name>_part_relations.csv', without the index column.
    """
    for score in get_corpus_files(file_path="*.mscz", pathlib=True):
        score = cast(Path, score)
        score_dir = score.parent
        score_name = score.stem

        summary = get_part_relationship_summary(
            score.with_suffix(".mxl"),
            score.with_suffix(".csv"),
            score_dir / f"{score_name}_annotations.csv"
        )
        summary.to_csv(
            score_dir / f"{score_name}_part_relations.csv", index=False
        )
def get_corpus_alignment_tables():
    """
    Get an alignment for every score in the corpus that has at least
    one recording listed in 'audios.tsv'.

    Each score is looked up in 'scores.tsv' by the path of its
    directory relative to the corpus root. Its recordings are the rows
    of 'audios.tsv' whose 'score_id' equals the score's 'id'. Scores
    with no recordings are skipped.

    NOTE(review): `.item()` requires exactly one matching row in
    'scores.tsv', so a missing or duplicated score path raises an
    error - confirm this is intended.
    """
    mscz_files = get_corpus_files(file_path="*.mscz", pathlib=True)
    audios = pd.read_csv(CORPUS_PATH / "audios.tsv", sep="\t")
    scores = pd.read_csv(CORPUS_PATH / "scores.tsv", sep="\t")

    for mscz_file in mscz_files:
        mscz_file = cast(Path, mscz_file)
        score_dir = mscz_file.relative_to(CORPUS_PATH).parent.as_posix()
        matching_scores = scores[scores["path"] == score_dir]
        score_id = matching_scores["id"].item()
        recordings = audios[audios["score_id"] == score_id]
        if recordings.empty:
            # No recordings for this score
            continue

        # One entry per recording; the two None values are passed
        # through to align_score_audios unchanged (presumably optional
        # fields - TODO confirm against align_score_audios)
        audio_files = [
            [
                recording["imslp_number"], recording["imslp_link"], None,
                None, "full audio"
            ]
            for _, recording in recordings.iterrows()
        ]

        align_score_audios(
            mscz_file.with_suffix(".mxl"),
            mscz_file.with_suffix(".mm.json"),
            audio_files,
            out_dir=mscz_file.parent
        )
if __name__ == "__main__":
    # Step 1: convert all scores from .mscz to MusicXML (.mxl) files
    ms3_convert(CORPUS_PATH, "mscz", "mxl")

    # Steps 2-6: per-score derived files, built in this order
    for _build_step in (
        get_corpus_measure_maps,                   # Measure maps
        get_corpus_annotations_and_melody_scores,  # Annotations, melodies
        get_corpus_lightweight_scores,             # Lightweight .csv files
        get_corpus_part_relations,                 # Part relationships
        get_corpus_alignment_tables,               # Alignment tables
    ):
        _build_step()

    user_region = "EU"
    # The metadata creation steps below are currently disabled:
    # create_audio_metadata(user_region)
    # create_score_metadata()
    # match_audios_to_scores()

    # Manual cleanup for score names and audio-score matching will be
    # required

    # Final steps: YAML files, then the contents
    get_yaml_files()
    make_contents()