forked from aws/serverless-application-model
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparse_docs.py
executable file
·99 lines (80 loc) · 3.32 KB
/
parse_docs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
#!/usr/bin/env python
"""
Script to parse a directory containing the AWS SAM documentation in Markdown
format (e.g. https://github.com/awsdocs/aws-sam-developer-guide/tree/main/doc_source).
Outputs in the docs.json format expected by the SAM JSON schema code (see
samtranslator/schema/schema.py).
Usage:
git clone https://github.com/awsdocs/aws-sam-developer-guide.git
bin/parse_docs.py aws-sam-developer-guide/doc_source > samtranslator/schema/docs.json
"""
import argparse
import json
import re
from pathlib import Path
from typing import Dict, Iterator, Tuple
def parse(s: str, cfn_docs: bool) -> Iterator[Tuple[str, str]]:
"""Parse an AWS docs page in Markdown format, yielding each property."""
parts = s.split("\n\n")
for part in parts:
# TODO: More robust matching against properties? This might skip or include wrong sections
sam_prop = not cfn_docs and part.startswith(" `")
cfn_prop = cfn_docs and re.match(r"`\w+` <a .+", part)
if sam_prop or cfn_prop:
name = part.split("`")[1]
yield name, part.strip()
# TODO: Change in the docs instead?
def fix_markdown_code_link(s: str) -> str:
"""Turns `[foo](bar)` into [`foo`](bar); the former doesn't display as a link."""
return re.sub(r"`\[(\w+)\]\(([^ ]+)\)`", r"[`\1`](\2)", s)
def remove_first_line(s: str) -> str:
try:
return s.split("\n", 1)[1]
except IndexError:
return ""
def convert_to_full_path(description: str, prefix: str) -> str:
pattern = re.compile("\(([#\.a-zA-Z0-9_-]+)\)")
matched_content = pattern.findall(description)
for path in matched_content:
if "https://docs.aws.amazon.com/" not in path:
url = path.split(".")[0] + ".html"
if "#" in path:
url += "#" + path.split("#")[1]
description = description.replace(path, prefix + url)
return description
def stringbetween(s: str, sep1: str, sep2: str) -> str:
return s.split(sep1, 1)[1].split(sep2, 1)[0]
def main() -> None:
parser = argparse.ArgumentParser()
parser.add_argument("dir", type=Path)
parser.add_argument("--cfn", action="store_true")
parser.add_argument("--with-title", help="use doc title instead of filename as key", action="store_true")
args = parser.parse_args()
props: Dict[str, Dict[str, str]] = {}
for path in args.dir.glob("*.md"):
text = path.read_text()
title = stringbetween(text, "# ", "<a")
page = title if args.with_title else path.stem
for name, description in parse(text, args.cfn):
if page not in props:
props[page] = {}
description = remove_first_line(description) # Remove property name; already in the schema title
description = fix_markdown_code_link(description)
prefix = (
"https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/"
if args.cfn
else "https://docs.aws.amazon.com/serverless-application-model/latest/developerguide/"
)
description = convert_to_full_path(description, prefix)
props[page][name] = description
print(
json.dumps(
{
"properties": props,
},
indent=2,
sort_keys=True,
)
)
if __name__ == "__main__":
main()