forked from dmmiller612/bert-extractive-summarizer
-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathv1_pipeline.py
41 lines (34 loc) · 1.66 KB
/
v1_pipeline.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
import os
import subprocess

import speech2text
from summarizer import Summarizer
def convert_mp4_to_audio_and_summarize_transcript(directory, file_path):
    """Extract audio from a video, transcribe it, and summarize the transcript.

    Pipeline: ffmpeg converts the video to a 16 kHz mono-friendly WAV, Azure
    speech-to-text transcribes it, and a BERT extractive summarizer condenses
    the transcript. Each summary sentence is paired with the timestamp of the
    utterance it came from.

    Parameters
    ----------
    directory : str
        Directory containing ``secrets.json`` (Azure credentials for the
        speech-to-text call).
    file_path : str
        Path to the input video. The extension is assumed to be 4 characters
        (e.g. ``.mp4``) — TODO confirm for other container formats.

    Returns
    -------
    list
        ``[timestamp_and_text, body]`` where ``timestamp_and_text`` is a list
        of ``{"timestamp": ..., "text": ...}`` dicts (one per summary
        sentence; ``timestamp`` is ``None`` when no utterance matched) and
        ``body`` is the full transcript string.
    """
    video_filename = file_path
    audio_filename = video_filename[:-4] + ".wav"
    # Argument-list form (shell=False) so paths containing spaces or shell
    # metacharacters can neither break the command nor inject into a shell;
    # check=True surfaces ffmpeg failures instead of continuing silently.
    subprocess.run(
        ['ffmpeg', '-y', '-i', video_filename,
         '-acodec', 'pcm_s16le', '-ar', '16000', audio_filename],
        check=True,
    )
    txt2timestamp, timestamp2txt = speech2text.speech_recognize_continuous_from_file(
        audio_filename,
        f'{directory}/secrets.json',
    )
    # Keys of txt2timestamp are utterance strings; join them into the full
    # transcript in recognition order.
    body = ' '.join(txt2timestamp)
    raw_list = body.split('.')
    model = Summarizer()
    result = model(body, min_length=60)
    brief = ''.join(result)
    print(brief)
    brief_list = [f'{sentence}.' for sentence in brief.split('.')]
    print('original length: ', len(body))
    print('summary length: ', len(brief))
    print('original/summary: ', len(body)/len(brief))
    print('original sentence count: ', len(raw_list))
    print('summary sentence count: ', len(brief_list))
    # Azure reports a timestamp per *utterance* (one or more sentences), not
    # per sentence. Pair each summary sentence with the timestamp of the
    # first utterance that contains it; use None when nothing matches. The
    # previous implementation collected timestamps in a separate list and
    # zip()-ed it with brief_list, which silently misaligned every later
    # pair whenever a sentence matched zero or multiple utterances.
    timestamp_and_text = []
    for sentence in brief_list:
        timestamp = next(
            (time for utterance, time in txt2timestamp.items()
             if sentence in utterance),
            None,
        )
        timestamp_and_text.append({"timestamp": timestamp, "text": sentence})
    # print(timestamp_and_text)
    return [timestamp_and_text, body]