Initial commit

2024-08-27 20:33:44 +02:00
commit 1f1832267d
14794 changed files with 1599592 additions and 0 deletions
--- a/env/lib/python3.10/site-packages/wagtail/rich_text/rewriters.py
+++ b/env/lib/python3.10/site-packages/wagtail/rich_text/rewriters.py
@@ -0,0 +1,242 @@
+"""
+Utility classes for rewriting elements of HTML-like strings
+"""
+
+import re
+from collections import defaultdict
+from typing import Callable, Dict, List
+
+from django.utils.functional import cached_property
+
+FIND_A_TAG = re.compile(r"<a(\b[^>]*)>")
+FIND_EMBED_TAG = re.compile(r"<embed(\b[^>]*)/>")
+FIND_ATTRS = re.compile(r'([\w-]+)\="([^"]*)"')
+
+
+def extract_attrs(attr_string: str) -> dict:
+    """
+    helper method to extract tag attributes, as a dict of un-escaped strings
+    """
+    attributes = {}
+    for name, val in FIND_ATTRS.findall(attr_string):
+        val = (
+            val.replace("&lt;", "<")
+            .replace("&gt;", ">")
+            .replace("&quot;", '"')
+            .replace("&amp;", "&")
+        )
+        attributes[name] = val
+    return attributes
+
+
+class TagMatch:
+    """Represents a single matched tag in a rich text string"""
+
+    def __init__(self, match):
+        self.match = match  # a regexp match object
+        self.replacement = None  # to be filled in by the rewriter
+
+    @cached_property
+    def attrs(self):
+        return extract_attrs(self.match.group(1))
+
+    @property
+    def start(self):
+        return self.match.start()
+
+    @property
+    def end(self):
+        return self.match.end()
+
+
+class TagRewriter:
+    def __init__(self, rules=None, bulk_rules=None, reference_extractors=None):
+        self.rules = rules or {}
+        self.bulk_rules = bulk_rules or {}
+        self.reference_extractors = reference_extractors or {}
+
+    def get_opening_tag_regex(self):
+        raise NotImplementedError
+
+    def get_tag_type_from_attrs(self, attrs):
+        """Given a dict of attributes from a tag, return the tag type."""
+        raise NotImplementedError
+
+    def get_tag_replacements(self, tag_type, attrs_list):
+        """Given a list of attribute dicts, all taken from tags of the same type, return a
+        corresponding list of replacement strings to replace the tags with.
+
+        Return an empty list for cases when you don't want any replacements made.
+        """
+        raise NotImplementedError
+
+    def __call__(self, html: str) -> str:
+        matches_by_tag_type = self.extract_tags(html)
+        matches_to_replace = []
+
+        # For each tag type, get the list of replacement strings for all tags of that type
+        for tag_type, tag_matches in matches_by_tag_type.items():
+            attr_dicts = [match.attrs for match in tag_matches]
+            replacements = self.get_tag_replacements(tag_type, attr_dicts)
+
+            if not replacements:
+                continue
+
+            for match, replacement in zip(tag_matches, replacements):
+                match.replacement = replacement
+                matches_to_replace.append(match)
+
+        # Replace the tags in order of appearance in the string, so that offsets remain valid
+        matches_to_replace.sort(key=lambda match: match.start)
+
+        offset = 0
+        for match in matches_to_replace:
+            html = (
+                html[: match.start + offset]
+                + match.replacement
+                + html[match.end + offset :]
+            )
+
+            offset += len(match.replacement) - match.end + match.start
+
+        return html
+
+    def extract_tags(self, html: str) -> Dict[str, List[TagMatch]]:
+        """Helper method to extract and group HTML tags and their attributes.
+
+        Returns a dict of TagMatch objects, mapping tag types to a list of all TagMatch objects of that tag type.
+        """
+        matches_by_tag_type = defaultdict(list)
+
+        # Regex used to match <tag ...> tags in the HTML.
+        re_pattern = self.get_opening_tag_regex()
+
+        for re_match in re_pattern.finditer(html):
+            tag_match = TagMatch(re_match)
+            tag_type = self.get_tag_type_from_attrs(tag_match.attrs)
+
+            matches_by_tag_type[tag_type].append(tag_match)
+
+        return matches_by_tag_type
+
+    def convert_rule_to_bulk_rule(self, rule: Callable) -> Callable:
+        def bulk_rule(args):
+            return list(map(rule, args))
+
+        return bulk_rule
+
+    def extract_references(self, html):
+        re_pattern = self.get_opening_tag_regex()
+        for match in re_pattern.findall(html):
+            attrs = extract_attrs(match)
+            tag_type = self.get_tag_type_from_attrs(attrs)
+
+            if tag_type not in self.reference_extractors:
+                continue
+
+            yield from self.reference_extractors[tag_type](attrs)
+
+        return []
+
+
+class EmbedRewriter(TagRewriter):
+    """
+    Rewrites <embed embedtype="foo" /> tags within rich text into the HTML
+    fragment given by the embed rule for 'foo'. Each embed rule is a function
+    that takes a dict of attributes and returns the HTML fragment.
+    """
+
+    def get_opening_tag_regex(self):
+        return FIND_EMBED_TAG
+
+    def get_tag_type_from_attrs(self, attrs):
+        return attrs.get("embedtype")
+
+    def get_tag_replacements(self, tag_type, attrs_list):
+        try:
+            rule = self.bulk_rules[tag_type]
+        except KeyError:
+            rule = None
+
+        if not rule:
+            try:
+                rule = self.rules[tag_type]
+            except KeyError:
+                pass
+            else:
+                rule = self.convert_rule_to_bulk_rule(rule)
+
+        # Silently drop any tags with an unrecognised or missing embedtype attribute.
+        return rule(attrs_list) if rule else [""] * len(attrs_list)
+
+
+class LinkRewriter(TagRewriter):
+    """
+    Rewrites <a linktype="foo"> tags within rich text into the HTML fragment
+    given by the rule for 'foo'. Each link rule is a function that takes a dict
+    of attributes and returns the HTML fragment for the opening tag (only).
+    """
+
+    def get_opening_tag_regex(self):
+        return FIND_A_TAG
+
+    def get_tag_type_from_attrs(self, attrs):
+        try:
+            return attrs["linktype"]
+        except KeyError:
+            href = attrs.get("href", None)
+            if href:
+                # From href attribute we try to detect only the linktypes that we
+                # currently support (`external` & `email`, `page` has a default handler)
+                # from the link chooser.
+                if href.startswith(("http:", "https:")):
+                    return "external"
+                elif href.startswith("mailto:"):
+                    return "email"
+                elif href.startswith("#"):
+                    return "anchor"
+
+    def get_tag_replacements(self, tag_type, attrs_list):
+        if not tag_type:
+            # We want to leave links without a linktype attribute unchanged,
+            # for example <a name="important-anchor">, so we return an empty
+            # list here so that no tag replacements are made.
+            return []
+
+        try:
+            rule = self.bulk_rules[tag_type]
+        except KeyError:
+            rule = None
+
+        if not rule:
+            try:
+                rule = self.rules[tag_type]
+            except KeyError:
+                if tag_type in ["email", "external", "anchor"]:
+                    # We also want to leave links with certain known linktype
+                    # attributes alone even if there are no richtext rules
+                    # registered for those types, for example
+                    # <a href="https://wagtail.org">, so we return an empty
+                    # list here so that no tag replacements are made.
+                    return []
+            else:
+                rule = self.convert_rule_to_bulk_rule(rule)
+
+        # Replace unrecognised link types with an empty link.
+        return rule(attrs_list) if rule else ["<a>"] * len(attrs_list)
+
+
+class MultiRuleRewriter:
+    """Rewrites HTML by applying a sequence of rewriter functions"""
+
+    def __init__(self, rewriters):
+        self.rewriters = rewriters
+
+    def __call__(self, html):
+        for rewrite in self.rewriters:
+            html = rewrite(html)
+        return html
+
+    def extract_references(self, html):
+        for rewriter in self.rewriters:
+            yield from rewriter.extract_references(html)