|
19 | 19 | from sphinx.util.index_entries import split_index_msg
|
20 | 20 |
|
21 | 21 | if TYPE_CHECKING:
|
22 |
| - from collections.abc import Iterable |
| 22 | + from collections.abc import Callable, Iterable |
23 | 23 |
|
24 | 24 | from sphinx.environment import BuildEnvironment
|
25 | 25 |
|
@@ -525,47 +525,12 @@ def stem(word_to_stem: str) -> str:
|
525 | 525 | self._index_entries[docname] = sorted(_index_entries)
|
526 | 526 |
|
def _word_collector(self, doctree: nodes.document) -> WordStore:
    """Gather the searchable content of *doctree* into a new ``WordStore``.

    The traversal itself is done by ``_feed_visit_nodes``; this method only
    creates the store and passes along the word splitter (``self.lang.split``)
    and language code (``self.lang.lang``) of the configured search language.
    """
    collected = WordStore()
    _feed_visit_nodes(
        doctree,
        word_store=collected,
        split=self.lang.split,
        language=self.lang.lang,
    )
    return collected
|
570 | 535 |
|
571 | 536 | def context_for_searchtool(self) -> dict[str, Any]:
|
@@ -611,3 +576,47 @@ def get_js_stemmer_code(self) -> str:
|
611 | 576 | )
|
612 | 577 | else:
|
613 | 578 | return self.lang.js_stemmer_code
|
| 579 | + |
| 580 | + |
def _feed_visit_nodes(
    node: nodes.Node,
    *,
    word_store: WordStore,
    split: Callable[[str], list[str]],
    language: str,
) -> None:
    """Recursively feed the searchable content below *node* into *word_store*.

    Handling per node type:

    * comment nodes are skipped, together with everything below them;
    * raw nodes are indexed only when their ``format`` includes ``html``
      (with markup stripped first), and are never descended into;
    * meta nodes accepted by ``_is_meta_keywords`` contribute their
      comma-separated ``content`` entries as words;
    * text nodes contribute the words produced by *split*;
    * title nodes are appended to ``word_store.titles`` and their words to
      ``word_store.title_words``.

    Every node that is not a comment or raw node then has its children
    visited the same way, so the text inside a title is also added to
    ``word_store.words``.

    :param node: The node to start from.
    :param word_store: The store that is extended in place.
    :param split: Callable that breaks a string into a list of words.
    :param language: Language code handed to ``_is_meta_keywords``.
    """
    if isinstance(node, nodes.comment):
        return

    if isinstance(node, nodes.raw):
        output_formats = node.get('format', '').split()
        if 'html' in output_formats:
            # Raw HTML may carry text worth searching. Crudely drop whole
            # <style> and <script> elements first, then every remaining tag,
            # and index whatever text is left.
            text = node.astext()
            for element_pattern in (r'<style.*?</style>', r'<script.*?</script>'):
                text = re.sub(
                    element_pattern, '', text, flags=re.IGNORECASE | re.DOTALL
                )
            text = re.sub(r'<[^<]+?>', '', text)
            word_store.words.extend(split(text))
        return

    if isinstance(node, nodes.meta) and _is_meta_keywords(node, language):
        word_store.words.extend(
            [entry.strip() for entry in node['content'].split(',')]
        )
    elif isinstance(node, nodes.Text):
        word_store.words.extend(split(node.astext()))
    elif isinstance(node, nodes.title):
        title_text = node.astext()
        # The first title seen is the main title; it is stored without an id.
        is_main_title = len(word_store.titles) == 0
        parent_ids = node.parent['ids']
        if not is_main_title and parent_ids:
            anchor_id = parent_ids[0]
        else:
            anchor_id = None
        word_store.titles.append((title_text, anchor_id))
        word_store.title_words.extend(split(title_text))

    for child in node.children:
        _feed_visit_nodes(
            child, word_store=word_store, split=split, language=language
        )
0 commit comments