Skip to content

Fix naive handling of relative links when proofing #75

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Feb 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 3 additions & 9 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,12 +28,6 @@ plugins:
- htmlproofer
```

To enable cross-page anchor validation, you must set `use_directory_urls = False` in `mkdocs.yml`:

```yaml
use_directory_urls: False
```

## Configuring

### `enabled`
Expand Down Expand Up @@ -147,11 +141,11 @@ plugins:

## Compatibility with `attr_list` extension

If you need to manually specify anchors make use of the `attr_list` [extension](https://python-markdown.github.io/extensions/attr_list) in the markdown.
If you need to manually specify anchors make use of the `attr_list` [extension](https://python-markdown.github.io/extensions/attr_list) in the markdown.
This can be useful for multilingual documentation to keep anchors as language neutral permalinks in all languages.

* A sample for a heading `# Grüße {#greetings}` (the slugified generated anchor `Gre` is overwritten with `greetings`).
* This also works for images `this is a nice image [](foo-bar.png){#nice-image}`
* This also works for images `this is a nice image [](foo-bar.png){#nice-image}`
* And generall for paragraphs:
```markdown
Listing: This is noteworthy.
Expand All @@ -164,4 +158,4 @@ More information about plugins in the [MkDocs documentation](http://www.mkdocs.o

## Acknowledgement

This work is based on the [mkdocs-markdownextradata-plugin](https://github.com/rosscdh/mkdocs-markdownextradata-plugin) project and the [Finding and Fixing Website Link Rot with Python, BeautifulSoup and Requests](https://www.twilio.com/en-us/blog/find-fix-website-link-rot-python-beautifulsoup-requests-html) article.
This work is based on the [mkdocs-markdownextradata-plugin](https://github.com/rosscdh/mkdocs-markdownextradata-plugin) project and the [Finding and Fixing Website Link Rot with Python, BeautifulSoup and Requests](https://www.twilio.com/en-us/blog/find-fix-website-link-rot-python-beautifulsoup-requests-html) article.
26 changes: 12 additions & 14 deletions htmlproofer/plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,14 +98,13 @@ def on_post_page(self, output_content: str, page: Page, config: Config) -> None:
if not self.config['enabled']:
return

use_directory_urls = config.data["use_directory_urls"]

# Optimization: At this point, we have all the files, so we can create
# a dictionary for faster lookups. Prior to this point, files are
# still being updated so creating a dictionary before now would result
# in incorrect values appearing as the key.
opt_files = {}
opt_files.update({os.path.normpath(file.url): file for file in self.files})
opt_files.update({os.path.normpath(file.src_uri): file for file in self.files})

# Optimization: only parse links and headings
# li, sup are used for footnotes
Expand All @@ -123,7 +122,7 @@ def on_post_page(self, output_content: str, page: Page, config: Config) -> None:
if self.config['warn_on_ignored_urls']:
log_warning(f"ignoring URL {url} from {page.file.src_path}")
else:
url_status = self.get_url_status(url, page.file.src_path, all_element_ids, opt_files, use_directory_urls)
url_status = self.get_url_status(url, page.file.src_path, all_element_ids, opt_files)
if self.bad_url(url_status) and self.is_error(self.config, url, url_status):
self.report_invalid_url(url, url_status, page.file.src_path)

Expand Down Expand Up @@ -161,8 +160,7 @@ def get_url_status(
url: str,
src_path: str,
all_element_ids: Set[str],
files: Dict[str, File],
use_directory_urls: bool
files: Dict[str, File]
) -> int:
if any(pat.match(url) for pat in LOCAL_PATTERNS):
return 0
Expand All @@ -174,18 +172,13 @@ def get_url_status(
return 0
if fragment and not path:
return 0 if url[1:] in all_element_ids else 404
elif not use_directory_urls:
# use_directory_urls = True injects too many challenges for locating the correct target
# Markdown file, so disable target anchor validation in this case. Examples include:
# ../..#BAD_ANCHOR style links to index.html and extra ../ inserted into relative
# links.
else:
is_valid = self.is_url_target_valid(url, src_path, files)
url_status = 404
if not is_valid and self.is_error(self.config, url, url_status):
log_warning(f"Unable to locate source file for: {url}")
return url_status
return 0
return 0

@staticmethod
def is_url_target_valid(url: str, src_path: str, files: Dict[str, File]) -> bool:
Expand Down Expand Up @@ -225,9 +218,14 @@ def find_source_file(url: str, src_path: str, files: Dict[str, File]) -> Optiona
# Convert root/site paths
search_path = os.path.normpath(url[1:])
else:
# Handle relative links by concatenating the source dir with the destination path
src_dir = urllib.parse.quote(str(pathlib.Path(src_path).parent), safe='/\\')
search_path = os.path.normpath(str(pathlib.Path(src_dir) / pathlib.Path(url)))
# Handle relative links by looking up the destination url for the
# src_path and getting the parent directory.
try:
dest_uri = files[src_path].dest_uri
src_dir = urllib.parse.quote(str(pathlib.Path(dest_uri).parent), safe='/\\')
search_path = os.path.normpath(str(pathlib.Path(src_dir) / pathlib.Path(url)))
except KeyError:
return None

try:
return files[search_path]
Expand Down
4 changes: 2 additions & 2 deletions tests/integration/docs/nested/page1.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ you can either link to a local file without or with an anchor.
* [Main Page](../index.md)
* [Sub-Page](./page2.md)
* <figure markdown>
<a href="../assets/hello-world.drawio.svg">
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This needed to be changed as it isn't a valid test with directory urls on. mkdocs won't change the link in the raw href to be correct. only those in markdown.

<a href="/assets/hello-world.drawio.svg">
![Image](../assets/hello-world.drawio.svg)
</a>
</figure>
Expand All @@ -27,6 +27,6 @@ But allows valid anchors such as

## Image Link absolute/relative

<a href="../assets/hello-world.drawio.svg">![test](../assets/hello-world.drawio.svg)</a>
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This needed to be changed as it isn't a valid test with directory urls on. mkdocs won't change the link in the raw href to be correct. only those in markdown.

<a href="/assets/hello-world.drawio.svg">![test](../assets/hello-world.drawio.svg)</a>

<a href="/assets/hello-world.drawio.svg">![test](/assets/hello-world.drawio.svg)</a>
130 changes: 87 additions & 43 deletions tests/unit/test_plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ def test_on_post_page__plugin_disabled():
),
)
def test_get_url_status__ignore_local_servers(plugin, empty_files, url):
assert plugin.get_url_status(url, 'src/path.md', set(), empty_files, False) == 0
assert plugin.get_url_status(url, 'src/path.md', set(), empty_files) == 0


@pytest.mark.parametrize(
Expand All @@ -126,7 +126,7 @@ def test_get_url_status(validate_external: bool):
plugin.load_config({'validate_external_urls': validate_external})

def get_url():
return plugin.get_url_status('https://google.com', 'src/path.md', set(), empty_files, False)
return plugin.get_url_status('https://google.com', 'src/path.md', set(), empty_files)

if validate_external:
with pytest.raises(Exception):
Expand Down Expand Up @@ -172,9 +172,9 @@ def test_contains_anchor(plugin, markdown, anchor, expected):


def test_get_url_status__same_page_anchor(plugin, empty_files):
assert plugin.get_url_status('#ref', 'src/path.md', {'ref'}, empty_files, False) == 0
assert plugin.get_url_status('##ref', 'src/path.md', {'ref'}, empty_files, False) == 404
assert plugin.get_url_status('#ref', 'src/path.md', set(), empty_files, False) == 404
assert plugin.get_url_status('#ref', 'src/path.md', {'ref'}, empty_files) == 0
assert plugin.get_url_status('##ref', 'src/path.md', {'ref'}, empty_files) == 404
assert plugin.get_url_status('#ref', 'src/path.md', set(), empty_files) == 404


@pytest.mark.parametrize(
Expand All @@ -195,7 +195,7 @@ def test_get_url_status__external(plugin, empty_files, url):

with patch.object(HtmlProoferPlugin, "get_external_url") as mock_get_ext_url:
mock_get_ext_url.return_value = expected_status
status = plugin.get_url_status(url, src_path, set(), empty_files, False)
status = plugin.get_url_status(url, src_path, set(), empty_files)

mock_get_ext_url.assert_called_once_with(url, scheme, src_path)
assert status == expected_status
Expand Down Expand Up @@ -241,39 +241,64 @@ def test_get_url_status__local_page(plugin):
index_page = Mock(spec=Page, markdown='# Heading\nContent')
page1_page = Mock(spec=Page, markdown='# Page One\n## Sub Heading\nContent')
special_char_page = Mock(spec=Page, markdown='# Heading éèà\n## Sub Heading éèà\nContent')
files = {os.path.normpath(file.url): file for file in Files([
Mock(spec=File, src_path='index.md', dest_path='index.html', url='index.html', page=index_page),
Mock(spec=File, src_path='page1.md', dest_path='page1.html', url='page1.html', page=page1_page),
mock_files = Files([
Mock(spec=File, src_path='index.md', dest_path='index.html',
dest_uri='index.html', url='index.html', src_uri='index.md',
page=index_page),
Mock(spec=File, src_path='page1.md', dest_path='page1.html',
dest_uri='page1.html', url='page1.html', src_uri='page1.md',
page=page1_page),
Mock(spec=File, src_path='Dir éèà/éèà.md', dest_path='Dir éèà/éèà.html',
url='Dir%20%C3%A9%C3%A8%C3%A0/%C3%A9%C3%A8%C3%A0.html', page=special_char_page),
])}
dest_uri='Dir éèà/éèà.html',
url='Dir%20%C3%A9%C3%A8%C3%A0/%C3%A9%C3%A8%C3%A0.html',
src_uri='Dir éèà/éèà.md', page=special_char_page),
Mock(spec=File, src_path='Dir éèà/page1.md', dest_path='Dir éèà/page1.html',
dest_uri='Dir éèà/page1.html',
url='Dir%20%C3%A9%C3%A8%C3%A0/page1.html',
src_uri='Dir%20%C3%A9%C3%A8%C3%A0/page1.md',
page=special_char_page),
])
files = {}
files.update({os.path.normpath(file.url): file for file in mock_files})
files.update({file.src_uri: file for file in mock_files})

assert plugin.get_url_status('index.html', 'page1.md', set(), files, False) == 0
assert plugin.get_url_status('index.html#heading', 'page1.md', set(), files, False) == 0
assert plugin.get_url_status('index.html#bad-heading', 'page1.md', set(), files, False) == 404
assert plugin.get_url_status('index.html', 'page1.md', set(), files) == 0
assert plugin.get_url_status('index.html#heading', 'page1.md', set(), files) == 0
assert plugin.get_url_status('index.html#bad-heading', 'page1.md', set(), files) == 404

assert plugin.get_url_status('page1.html', 'page1.md', set(), files, False) == 0
assert plugin.get_url_status('page1.html#sub-heading', 'page1.md', set(), files, False) == 0
assert plugin.get_url_status('page1.html#heading', 'page1.md', set(), files, False) == 404
assert plugin.get_url_status('page1.html', 'page1.md', set(), files) == 0
assert plugin.get_url_status('page1.html#sub-heading', 'page1.md', set(), files) == 0
assert plugin.get_url_status('page1.html#heading', 'page1.md', set(), files) == 404

assert plugin.get_url_status('page2.html', 'page1.md', set(), files, False) == 404
assert plugin.get_url_status('page2.html#heading', 'page1.md', set(), files, False) == 404
assert plugin.get_url_status('page2.html', 'page1.md', set(), files) == 404
assert plugin.get_url_status('page2.html#heading', 'page1.md', set(), files) == 404

assert plugin.get_url_status('Dir%20%C3%A9%C3%A8%C3%A0/%C3%A9%C3%A8%C3%A0.html#sub-heading-eea',
'page1.md', set(), files, False) == 0
assert plugin.get_url_status('%C3%A9%C3%A8%C3%A0.html#sub-heading-eea', 'Dir éèà/page3.md', set(), files, False) == 0
assert plugin.get_url_status(
'Dir%20%C3%A9%C3%A8%C3%A0/%C3%A9%C3%A8%C3%A0.html#sub-heading-eea',
'page1.md', set(), files) == 0
assert plugin.get_url_status(
'%C3%A9%C3%A8%C3%A0.html#sub-heading-eea',
'Dir%20%C3%A9%C3%A8%C3%A0/page1.md',
set(), files) == 0


def test_get_url_status__non_markdown_page(plugin):
index_page = Mock(spec=Page, markdown='# Heading\nContent')
files = {os.path.normpath(file.url): file for file in Files([
Mock(spec=File, src_path='index.md', dest_path='index.html', url='index.html', page=index_page),
Mock(spec=File, src_path='drawing.svg', dest_path='drawing.svg', url='drawing.svg', page=None),
])}
mock_files = Files([
Mock(spec=File, src_path='index.md', dest_path='index.html',
dest_uri='index.html', url='index.html', src_uri='index.md',
page=index_page),
Mock(spec=File, src_path='drawing.svg', dest_path='drawing.svg',
dest_uri='index.html', url='drawing.svg', src_uri='drawing.svg',
page=None),
])
files = {}
files.update({os.path.normpath(file.url): file for file in mock_files})
files.update({file.src_uri: file for file in mock_files})

assert plugin.get_url_status('drawing.svg', 'index.md', set(), files, False) == 0
assert plugin.get_url_status('/drawing.svg', 'index.md', set(), files, False) == 0
assert plugin.get_url_status('not-existing.svg', 'index.md', set(), files, False) == 404
assert plugin.get_url_status('drawing.svg', 'index.md', set(), files) == 0
assert plugin.get_url_status('/drawing.svg', 'index.md', set(), files) == 0
assert plugin.get_url_status('not-existing.svg', 'index.md', set(), files) == 404


def test_get_url_status__local_page_nested(plugin):
Expand All @@ -282,48 +307,67 @@ def test_get_url_status__local_page_nested(plugin):
nested1_sibling_page = Mock(spec=Page, markdown='# Nested Sibling')
nested2_page = Mock(spec=Page, markdown='# Nested\n## Nested Two\nContent')
nested2_sibling_page = Mock(spec=Page, markdown='# Nested Sibling')
files = {os.path.normpath(file.url): file for file in Files([
Mock(spec=File, src_path='index.md', dest_path='index.html', url='index.html', page=index_page),
mock_files = Files([
Mock(
spec=File,
src_path='index.md',
dest_path='index.html',
dest_uri='index.html',
url='index.html',
src_uri='index.md',
page=index_page),
Mock(
spec=File,
src_path='foo/bar/nested.md',
dest_path='foo/bar/nested.html',
dest_uri='foo/bar/nested.html',
url='foo/bar/nested.html',
src_uri='foo/bar/nested.md',
page=nested1_page
),
Mock(
spec=File,
src_path='foo/bar/sibling.md',
dest_path='foo/bar/sibling.html',
dest_uri='foo/bar/sibling.html',
url='foo/bar/sibling.html',
src_uri='foo/bar/sibling.md',
page=nested1_sibling_page
),
Mock(
spec=File,
src_path='foo/baz/nested.md',
dest_path='foo/baz/nested.html',
dest_uri='foo/baz/nested.html',
url='foo/baz/nested.html',
src_uri='foo/baz/nested.md',
page=nested2_page
),
Mock(
spec=File,
src_path='foo/baz/sibling.md',
dest_path='foo/baz/sibling.html',
dest_uri='foo/baz/sibling.html',
url='foo/baz/sibling.html',
src_uri='foo/baz/sibling.md',
page=nested2_sibling_page
),
])}
])

files = {}
files.update({os.path.normpath(file.url): file for file in mock_files})
files.update({file.src_uri: file for file in mock_files})

assert plugin.get_url_status('nested.html#nested-one', 'foo/bar/sibling.md', set(), files, False) == 0
assert plugin.get_url_status('nested.html#nested-two', 'foo/bar/sibling.md', set(), files, False) == 404
assert plugin.get_url_status('nested.html#nested-one', 'foo/bar/sibling.md', set(), files) == 0
assert plugin.get_url_status('nested.html#nested-two', 'foo/bar/sibling.md', set(), files) == 404

assert plugin.get_url_status('nested.html#nested-two', 'foo/baz/sibling.md', set(), files, False) == 0
assert plugin.get_url_status('nested.html#nested-one', 'foo/baz/sibling.md', set(), files, False) == 404
assert plugin.get_url_status('nested.html#nested-two', 'foo/baz/sibling.md', set(), files) == 0
assert plugin.get_url_status('nested.html#nested-one', 'foo/baz/sibling.md', set(), files) == 404

assert plugin.get_url_status('foo/bar/nested.html#nested-one', 'index.md', set(), files, False) == 0
assert plugin.get_url_status('foo/baz/nested.html#nested-two', 'index.md', set(), files, False) == 0
assert plugin.get_url_status('foo/bar/nested.html#nested-one', 'index.md', set(), files) == 0
assert plugin.get_url_status('foo/baz/nested.html#nested-two', 'index.md', set(), files) == 0

assert plugin.get_url_status('/index.html', 'foo/baz/sibling.md', set(), files, False) == 0
assert plugin.get_url_status('/index.html', 'foo/baz/sibling.md', set(), files) == 0


@patch.object(htmlproofer.plugin, "log_warning", autospec=True)
Expand All @@ -334,7 +378,7 @@ def test_get_url_status__excluded_non_existing_relative_url__no_warning(log_warn
files = {}
plugin.config['raise_error_excludes'][url_status] = [url]

status = plugin.get_url_status(url, src_path, set(), files, False)
status = plugin.get_url_status(url, src_path, set(), files)

log_warning_mock.assert_not_called()
assert 0 == status
Expand All @@ -349,12 +393,12 @@ def test_get_url_status__excluded_existing_relative_url__no_warning(log_warning_
existing_page = Mock(spec=Page, markdown='')
files = {
os.path.normpath(file.url): file for file in Files([
Mock(spec=File, src_path=src_path, dest_path=url, url=url, page=existing_page)
Mock(spec=File, src_path=src_path, dest_path=url, dest_uri=url, url=url, src_uri=src_path, page=existing_page)
])
}
plugin.config['raise_error_excludes'][url_status] = [url]

status = plugin.get_url_status(url, src_path, set(), files, False)
status = plugin.get_url_status(url, src_path, set(), files)

log_warning_mock.assert_not_called()
assert 0 == status
Expand All @@ -367,7 +411,7 @@ def test_get_url_status__non_existing_relative_url__warning_and_404(log_warning_
src_path = "index.md"
files = {}

status = plugin.get_url_status(url, src_path, set(), files, False)
status = plugin.get_url_status(url, src_path, set(), files)

log_warning_mock.assert_called_once()
assert expected_url_status == status
Expand Down