-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathextract_text_from_docs.py
96 lines (78 loc) · 2.92 KB
/
extract_text_from_docs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
import srn_docs_api
from io import StringIO
from bs4 import BeautifulSoup
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
def extract_text_from_srn_doc(fpath):
    """
    Tries to extract text from a downloaded SRN document. Currently, only PDF
    and HTML are supported and page-wise extraction (obviously) only works
    with PDF.

    Args:
        fpath (str): The file path to a locally downloaded SRN document.

    Raises:
        Exception: Whenever either PDF and HTML text extraction fail.

    Returns:
        [str]: A list of strings containing text for each parsed page. For
        HTML documents the list contains a single element holding the full
        document text.
    """
    text = []
    try:
        with open(fpath, 'rb') as f:
            parser = PDFParser(f)
            doc = PDFDocument(parser)
            rsrcmgr = PDFResourceManager()
            for page in PDFPage.create_pages(doc):
                output_string = StringIO()
                # TextConverter is a pdfminer device that must be closed
                # after use; use try/finally so it is released per page.
                device = TextConverter(
                    rsrcmgr, output_string, laparams=LAParams()
                )
                try:
                    interpreter = PDFPageInterpreter(rsrcmgr, device)
                    interpreter.process_page(page)
                    text.append(output_string.getvalue())
                finally:
                    device.close()
    except Exception:
        # pdfminer raises several exception types for malformed files, so a
        # broad (but not bare) except acts as the PDF/HTML dispatch gate.
        print("File does not parse as PDF - trying HTML")
    if not text:
        try:
            with open(fpath) as f:
                soup = BeautifulSoup(f, 'html.parser')
            # soup.find() returns a tag (truthy) only if markup was parsed.
            if soup.find():
                # Wrap in a list so the return type matches the PDF branch
                # and the documented [str] contract.
                text = [soup.get_text(' ', strip=True)]
            else:
                raise Exception("no parseable HTML markup found")
        except Exception:
            print("File is also not parseable as HTML - giving up")
    return text
if __name__ == "__main__":
    # Smoke-test driver: fetch SRN metadata, download one document for a
    # company matching 'Allianz', and print the text of a random page.
    import random

    companies = srn_docs_api.get_srn_companies()
    documents = srn_docs_api.get_srn_documents()

    print("Searching company with a name containing 'Allianz'")
    matches = [c for c in companies if 'Allianz' in c['name']]
    # Guard: matches[0] below would raise an opaque IndexError otherwise.
    if not matches:
        raise SystemExit("No company matching 'Allianz' found - giving up.")
    print(
        f"Found {len(matches)} match(es). " +
        "Retrieving the documents for the first match."
    )
    docs = [d for d in documents if d['company_id'] == matches[0]['id']]
    # Guard: docs[0] below would raise an opaque IndexError otherwise.
    if not docs:
        raise SystemExit("No documents found for the matched company.")
    FPATH = 'test_srn_docs.pdf'
    print(
        f"Found {len(docs)} documents. " +
        "Retrieving the first document from the list " +
        f"and storing as '{FPATH}'."
    )
    srn_docs_api.download_document(docs[0]['id'], FPATH)
    print(
        "Parsing the text of the document into a page-wise list. " +
        "This might take a while ..."
    )
    tlist = extract_text_from_srn_doc(FPATH)
    if len(tlist) > 0:
        # randrange(n) is the idiomatic form of randint(0, n - 1).
        p = random.randrange(len(tlist))
        print(
            f"Parsed {len(tlist)} document pages. " +
            f"This is the text for the randomly parsed page {p}:\n\n" +
            tlist[p] + "\n\n"
        )