Skip to content

Commit 44d69a1

Browse files
authored
fix(schema): improve docs parsing (#2850)
1 parent d2d6fd7 commit 44d69a1

File tree

6 files changed

+2272
-3546
lines changed

6 files changed

+2272
-3546
lines changed

Makefile

+1-1
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ fetch-schema-data:
6161
update-schema-data:
6262
# Parse docs
6363
bin/parse_docs.py .tmp/aws-sam-developer-guide/doc_source > schema_source/docs.json
64-
bin/parse_docs.py --cfn --with-title .tmp/aws-cloudformation-user-guide/doc_source > schema_source/cloudformation-docs.json
64+
bin/parse_docs.py --cfn .tmp/aws-cloudformation-user-guide/doc_source > schema_source/cloudformation-docs.json
6565

6666
# Add CloudFormation docs to CloudFormation schema
6767
python bin/add_docs_cfn_schema.py --schema .tmp/cloudformation.schema.json --docs schema_source/cloudformation-docs.json > schema_source/cloudformation.schema.json

bin/parse_docs.py

+9-5
Original file line numberDiff line numberDiff line change
@@ -22,12 +22,14 @@ def parse(s: str) -> Iterator[Tuple[str, str]]:
2222
"""Parse an AWS docs page in Markdown format, yielding each property."""
2323
# Truncate the page at the return-values heading so return values are not parsed as properties
2424
with suppress(ValueError):
25-
s = s[: s.index("Return Values")]
25+
s = s[: s.index("# Return Value")]
26+
with suppress(ValueError):
27+
s = s[: s.index("# Return value")]
2628
parts = s.split("\n\n")
2729
for part in parts:
2830
match = re.match(r"^\s*`(\w+)`\s+<a", part)
2931
if match:
30-
yield match.group(1), part.strip()
32+
yield match.group(1), part
3133

3234

3335
# TODO: Change in the docs instead?
@@ -64,14 +66,13 @@ def main() -> None:
6466
parser = argparse.ArgumentParser()
6567
parser.add_argument("dir", type=Path)
6668
parser.add_argument("--cfn", action="store_true")
67-
parser.add_argument("--with-title", help="use doc title instead of filename as key", action="store_true")
6869
args = parser.parse_args()
6970

7071
props: Dict[str, Dict[str, str]] = {}
7172
for path in args.dir.glob("*.md"):
7273
text = path.read_text()
7374
title = stringbetween(text, "# ", "<a")
74-
page = title if args.with_title else path.stem
75+
page = title if args.cfn else path.stem
7576
for name, description in parse(text):
7677
if page not in props:
7778
props[page] = {}
@@ -83,7 +84,10 @@ def main() -> None:
8384
else "https://docs.aws.amazon.com/serverless-application-model/latest/developerguide/"
8485
)
8586
description = convert_to_full_path(description, prefix)
86-
props[page][name] = description
87+
# Assume the properties we care about appear first on the page, so keep the first occurrence of a name and skip later duplicates
88+
if name in props[page]:
89+
continue
90+
props[page][name] = description.strip()
8791

8892
print(
8993
json.dumps(

0 commit comments

Comments
 (0)