Initial commit

2024-08-27 20:33:44 +02:00
commit 1f1832267d
14794 changed files with 1599592 additions and 0 deletions
--- a/env/lib/python3.10/site-packages/wagtail/rich_text/init.py
+++ b/env/lib/python3.10/site-packages/wagtail/rich_text/init.py
@@ -0,0 +1,183 @@
+import re
+from functools import lru_cache
+from html import unescape
+from typing import List
+
+from django.core.validators import MaxLengthValidator
+from django.db.models import Model
+from django.template.loader import render_to_string
+from django.utils.html import strip_tags
+from django.utils.safestring import mark_safe
+
+from wagtail.rich_text.feature_registry import FeatureRegistry
+from wagtail.rich_text.rewriters import EmbedRewriter, LinkRewriter, MultiRuleRewriter
+
+# Module-level registry instance; get_rewriter() below reads its link and embed handlers.
+features = FeatureRegistry()
+
+
+# Rewriter function to be built up on first call to expand_db_html, using the utility classes
+# from wagtail.rich_text.rewriters along with the embed handlers / link handlers registered
+# with the feature registry
+
+
+@lru_cache(maxsize=None)
+def get_rewriter():
+    """
+    Assemble the combined link / embed rewriter from the handlers held by ``features``.
+
+    The result is memoised by ``lru_cache``, so the rewriter is built on the first call
+    and the same object is returned on every later call.
+    """
+    embed_handlers = features.get_embed_types()
+    link_handlers = features.get_link_types()
+
+    # Links: one bulk expansion rule and one reference extractor per registered linktype
+    link_rewriter = LinkRewriter(
+        bulk_rules={
+            name: link_handler.expand_db_attributes_many
+            for name, link_handler in link_handlers.items()
+        },
+        reference_extractors={
+            name: link_handler.extract_references
+            for name, link_handler in link_handlers.items()
+        },
+    )
+
+    # Embeds: same arrangement, keyed by embedtype
+    embed_rewriter = EmbedRewriter(
+        bulk_rules={
+            name: embed_handler.expand_db_attributes_many
+            for name, embed_handler in embed_handlers.items()
+        },
+        reference_extractors={
+            name: embed_handler.extract_references
+            for name, embed_handler in embed_handlers.items()
+        },
+    )
+
+    # Links are rewritten before embeds
+    return MultiRuleRewriter([link_rewriter, embed_rewriter])
+
+
+def expand_db_html(html):
+    """
+    Convert HTML in its database representation into HTML suitable for front-end
+    templates, by passing it through the shared rewriter.
+    """
+    return get_rewriter()(html)
+
+
+def extract_references_from_rich_text(html):
+    """
+    Yield every reference that the shared rewriter extracts from the given rich text HTML.
+    """
+    yield from get_rewriter().extract_references(html)
+
+
+def get_text_for_indexing(richtext):
+    """
+    Return a plain text version of a rich text string, suitable for search indexing;
+    like Django's strip_tags, but ensures that whitespace is left between block elements
+    so that <p>hello</p><p>world</p> gives "hello world", not "helloworld".
+
+    Line breaks and rules are recognised both in self-closing form (<br />, <hr/>) and
+    in the plain HTML form (<br>, <hr>). Entities are un-escaped in the returned text.
+    """
+    # insert space after </p>, </h1> - </h6>, </li> and </blockquote> tags
+    richtext = re.sub(
+        r"(</(p|h\d|li|blockquote)>)", r"\1 ", richtext, flags=re.IGNORECASE
+    )
+    # also insert space after <br> / <br /> and <hr> / <hr />; the closing slash is
+    # optional so that void elements written without it still separate words
+    richtext = re.sub(r"(<(br|hr)\s*/?>)", r"\1 ", richtext, flags=re.IGNORECASE)
+    # strip the tags first, then trim, then decode entities such as &amp;
+    return unescape(strip_tags(richtext).strip())
+
+
+class RichText:
+    """
+    Value object wrapping a piece of rich text so that it can be rendered.
+    The raw stored markup is kept on the 'source' attribute, and converting the
+    object to a string (or calling __html__) gives the front-end HTML rendering.
+    Used as the native value of a wagtailcore.blocks.field_block.RichTextBlock.
+    """
+
+    def __init__(self, source):
+        # Falsy sources (None, "") are normalised to the empty string
+        self.source = source if source else ""
+
+    def __html__(self):
+        expanded = expand_db_html(self.source)
+        context = {"html": expanded}
+        return render_to_string("wagtailcore/shared/richtext.html", context)
+
+    def __str__(self):
+        rendered = self.__html__()
+        return mark_safe(rendered)
+
+    def __bool__(self):
+        # Truthiness follows the stored source, not the rendered output
+        return bool(self.source)
+
+    def __eq__(self, other):
+        # Only another RichText with the same source compares equal
+        return isinstance(other, RichText) and self.source == other.source
+
+
+class EntityHandler:
+    """
+    Describes how one kind of 'entity' is rewritten at render time.
+
+    An entity is a placeholder tag inside the saved rich text that has to be turned into
+    real HTML when the content is rendered. Typically (but not necessarily) it refers to a
+    model instance that is fetched so its current data can be written into the output,
+    which avoids storing potentially changeable data within the saved rich text.
+
+    Currently Wagtail supports two kinds of entity: links (represented as <a linktype="...">...</a>)
+    and embeds (represented as <embed embedtype="..." />).
+    """
+
+    @staticmethod
+    def get_model():
+        """
+        Return the model class this handler deals with (e.g. Page), where that applies.
+        Subclasses are expected to override this; the base implementation raises.
+        """
+        raise NotImplementedError
+
+    @classmethod
+    def get_instance(cls, attrs: dict) -> Model:
+        """
+        Fetch the single instance whose id is given by attrs["id"].
+        """
+        manager = cls.get_model()._default_manager
+        return manager.get(id=attrs["id"])
+
+    @classmethod
+    def get_many(cls, attrs_list: List[dict]) -> List[Model]:
+        """
+        Fetch the instances for a list of attribute dicts with one in_bulk() call.
+        The result is aligned with attrs_list; entries whose id is not found are None.
+        """
+        model = cls.get_model()
+        requested_ids = [attrs.get("id") for attrs in attrs_list]
+        found = model._default_manager.in_bulk(requested_ids)
+        # ids taken from tag attributes are strings, so compare on the string form
+        found_by_str_id = {str(pk): instance for pk, instance in found.items()}
+        return [found_by_str_id.get(str(pk)) for pk in requested_ids]
+
+    @staticmethod
+    def expand_db_attributes(attrs: dict) -> str:
+        """
+        Turn the attribute dict of one stored entity tag into its real HTML
+        representation. Subclasses are expected to override this.
+        """
+        raise NotImplementedError
+
+    @classmethod
+    def expand_db_attributes_many(cls, attrs_list: List[dict]) -> List[str]:
+        """
+        Turn a list of attribute dicts from stored entity tags into the list of their
+        real HTML representations, by expanding each one in turn.
+        """
+        return [cls.expand_db_attributes(attrs) for attrs in attrs_list]
+
+    @classmethod
+    def extract_references(cls, attrs):
+        """
+        Return the (content_type_id, object_id, model_path, content_path) tuples for the
+        database objects referenced by this entity, as per
+        wagtail.models.ReferenceIndex._extract_references_from_object.
+        The base implementation reports no references.
+        """
+        return []
+
+
+class LinkHandler(EntityHandler):
+    """
+    EntityHandler subclass for link entities; adds no behaviour of its own.
+    """
+
+    pass
+
+
+class EmbedHandler(EntityHandler):
+    """
+    EntityHandler subclass for embed entities; adds no behaviour of its own.
+    """
+
+    pass
+
+
+class RichTextMaxLengthValidator(MaxLengthValidator):
+    """
+    MaxLengthValidator variant whose limit applies to the visible text only: HTML tags
+    are stripped before counting, and entities are un-escaped so that the count is
+    consistent with the client-side character count.
+    """
+
+    def clean(self, x):
+        text_only = strip_tags(x)
+        decoded = unescape(text_only)
+        return len(decoded)
--- a/env/lib/python3.10/site-packages/wagtail/rich_text/pycache/init.cpython-310.pyc
+++ b/env/lib/python3.10/site-packages/wagtail/rich_text/pycache/init.cpython-310.pyc
--- a/env/lib/python3.10/site-packages/wagtail/rich_text/pycache/feature_registry.cpython-310.pyc
+++ b/env/lib/python3.10/site-packages/wagtail/rich_text/pycache/feature_registry.cpython-310.pyc
--- a/env/lib/python3.10/site-packages/wagtail/rich_text/pycache/pages.cpython-310.pyc
+++ b/env/lib/python3.10/site-packages/wagtail/rich_text/pycache/pages.cpython-310.pyc
--- a/env/lib/python3.10/site-packages/wagtail/rich_text/pycache/rewriters.cpython-310.pyc
+++ b/env/lib/python3.10/site-packages/wagtail/rich_text/pycache/rewriters.cpython-310.pyc
--- a/env/lib/python3.10/site-packages/wagtail/rich_text/feature_registry.py
+++ b/env/lib/python3.10/site-packages/wagtail/rich_text/feature_registry.py
@@ -0,0 +1,111 @@
+from wagtail import hooks
+
+
+class FeatureRegistry:
+    """
+    A central store of information about optional features that can be enabled in rich text
+    editors by passing a ``features`` list to the RichTextField, such as how to
+    whitelist / convert HTML tags, and how to enable the feature on various editors.
+
+    This information may come from diverse sources - for example, wagtailimages might define
+    an 'images' feature and a Draftail plugin for it, while a third-party module might
+    define a TinyMCE plugin for the same feature. The information is therefore collected into
+    this registry via the 'register_rich_text_features' hook.
+
+    The hook is run lazily: every ``get_*`` method triggers it on first use, whereas the
+    ``register_*`` methods only record what they are given.
+    """
+
+    def __init__(self):
+        # Has the register_rich_text_features hook been run for this registry?
+        self.has_scanned_for_features = False
+
+        # a dict of dicts, one for each editor (draftail.js, TinyMCE etc); each dict is a mapping
+        # of feature names to 'plugin' objects that define how to implement that feature
+        # (e.g. paths to JS files to import). The API of that plugin object is not defined
+        # here, and is specific to each editor.
+        self.plugins_by_editor = {}
+
+        # a list of feature names that will be applied on rich text areas that do not specify
+        # an explicit `feature` list.
+        self.default_features = []
+
+        # a mapping of linktype names to handler objects for converting database representations
+        # of links (e.g. <a linktype="page" id="123">) into front-end HTML. Handlers are stored
+        # under their ``identifier`` attribute.
+        self.link_types = {}
+
+        # a mapping of embedtype names to handler objects for converting database representations
+        # of embedded content (e.g. <embed embedtype="image" id="123" format="left" alt="foo">)
+        # into front-end HTML. Handlers are stored under their ``identifier`` attribute.
+        self.embed_types = {}
+
+        # a dict of dicts, one for each converter backend (editorhtml, contentstate etc);
+        # each dict is a mapping of feature names to 'rule' objects that define how to convert
+        # that feature's elements between editor representation and database representation
+        # (e.g. elements to whitelist, functions for transferring attributes).
+        # The API of that rule object is not defined here, and is specific to each converter backend.
+        self.converter_rules_by_converter = {}
+
+    def get_default_features(self):
+        """Return the list of default feature names, scanning the hooks first if needed."""
+        if not self.has_scanned_for_features:
+            self._scan_for_features()
+
+        return self.default_features
+
+    def _scan_for_features(self):
+        """Run every 'register_rich_text_features' hook against this registry, then mark it scanned."""
+        for fn in hooks.get_hooks("register_rich_text_features"):
+            fn(self)
+        self.has_scanned_for_features = True
+
+    def register_editor_plugin(self, editor_name, feature_name, plugin):
+        """Record ``plugin`` as the implementation of ``feature_name`` for ``editor_name``."""
+        plugins = self.plugins_by_editor.setdefault(editor_name, {})
+        plugins[feature_name] = plugin
+
+    def get_editor_plugin(self, editor_name, feature_name):
+        """Return the plugin registered for this editor / feature pair, or None if there is none."""
+        if not self.has_scanned_for_features:
+            self._scan_for_features()
+
+        try:
+            return self.plugins_by_editor[editor_name][feature_name]
+        except KeyError:
+            return None
+
+    def register_link_type(self, handler):
+        """Register a link handler under its ``identifier``."""
+        self.link_types[handler.identifier] = handler
+
+    def get_link_types(self):
+        """Return the mapping of linktype names to handlers, scanning the hooks first if needed."""
+        if not self.has_scanned_for_features:
+            self._scan_for_features()
+        return self.link_types
+
+    def register_embed_type(self, handler):
+        """Register an embed handler under its ``identifier``."""
+        self.embed_types[handler.identifier] = handler
+
+    def get_embed_types(self):
+        """Return the mapping of embedtype names to handlers, scanning the hooks first if needed."""
+        if not self.has_scanned_for_features:
+            self._scan_for_features()
+        return self.embed_types
+
+    def register_converter_rule(self, converter_name, feature_name, rule):
+        """Record ``rule`` as the conversion rule of ``feature_name`` for ``converter_name``."""
+        rules = self.converter_rules_by_converter.setdefault(converter_name, {})
+        rules[feature_name] = rule
+
+    def get_converter_rule(self, converter_name, feature_name):
+        """Return the rule registered for this converter / feature pair, or None if there is none."""
+        if not self.has_scanned_for_features:
+            self._scan_for_features()
+
+        try:
+            return self.converter_rules_by_converter[converter_name][feature_name]
+        except KeyError:
+            return None
+
+    @staticmethod
+    def function_as_entity_handler(identifier, fn):
+        """
+        Supports legacy registering of entity handlers as functions.
+
+        Wraps ``fn`` (a function taking one attribute dict and returning HTML) in a
+        handler class carrying the given ``identifier``. Besides ``expand_db_attributes``
+        the class also provides ``expand_db_attributes_many`` and ``extract_references``,
+        because the rich text rewriter reads both of those from every registered handler;
+        without them a function-based handler would fail with AttributeError.
+        """
+
+        def expand_db_attributes_many(attrs_list):
+            # Bulk form: apply the legacy single-item function to each attribute dict
+            return [fn(attrs) for attrs in attrs_list]
+
+        def extract_references(attrs):
+            # A bare function carries no reference information
+            return []
+
+        return type(
+            "EntityHandlerRegisteredAsFunction",
+            (object,),
+            {
+                "identifier": identifier,
+                "expand_db_attributes": staticmethod(fn),
+                "expand_db_attributes_many": staticmethod(expand_db_attributes_many),
+                "extract_references": staticmethod(extract_references),
+            },
+        )
--- a/env/lib/python3.10/site-packages/wagtail/rich_text/pages.py
+++ b/env/lib/python3.10/site-packages/wagtail/rich_text/pages.py
@@ -0,0 +1,40 @@
+from typing import List
+
+from django.db.models import Model
+from django.utils.html import escape
+
+from wagtail.models import Page
+from wagtail.rich_text import LinkHandler
+
+
+class PageLinkHandler(LinkHandler):
+    """
+    Link handler for <a linktype="page" id="..."> tags, which point at Page objects.
+    """
+
+    identifier = "page"
+
+    @staticmethod
+    def get_model():
+        """Return the model this handler resolves links to."""
+        return Page
+
+    @classmethod
+    def get_many(cls, attrs_list: List[dict]) -> List[Model]:
+        """
+        Fetch the pages for a list of attribute dicts in one query.
+        The result is aligned with attrs_list; entries whose id is not found are None.
+        """
+        # Override LinkHandler.get_many to reduce database queries through the
+        # use of PageQuerySet.specific() instead of QuerySet.in_bulk().
+        instance_ids = [attrs.get("id") for attrs in attrs_list]
+        qs = Page.objects.filter(id__in=instance_ids).defer_streamfields().specific()
+        # ids taken from tag attributes are strings, so compare on the string form
+        pages_by_str_id = {str(page.id): page for page in qs}
+        return [pages_by_str_id.get(str(id_)) for id_ in instance_ids]
+
+    @classmethod
+    def expand_db_attributes(cls, attrs: dict) -> str:
+        """Return the opening <a> tag for a single page link, via the bulk implementation."""
+        return cls.expand_db_attributes_many([attrs])[0]
+
+    @classmethod
+    def expand_db_attributes_many(cls, attrs_list: List[dict]) -> List[str]:
+        """
+        Return one opening <a> tag per attribute dict: an href to the page's localized
+        URL (HTML-escaped) when the page was found, or a bare <a> when it was not.
+        """
+        return [
+            '<a href="%s">' % escape(page.localized.url) if page else "<a>"
+            for page in cls.get_many(attrs_list)
+        ]
+
+    @classmethod
+    def extract_references(cls, attrs):
+        """
+        Yield a (Page, object_id, model_path, content_path) tuple for the linked page.
+        Nothing is yielded for a link that has no "id" attribute, in line with
+        get_many, which also tolerates a missing id.
+        """
+        page_id = attrs.get("id")
+        if page_id is None:
+            return
+        yield Page, page_id, "", ""
--- a/env/lib/python3.10/site-packages/wagtail/rich_text/rewriters.py
+++ b/env/lib/python3.10/site-packages/wagtail/rich_text/rewriters.py
@@ -0,0 +1,242 @@
+"""
+Utility classes for rewriting elements of HTML-like strings
+"""
+
+import re
+from collections import defaultdict
+from typing import Callable, Dict, List
+
+from django.utils.functional import cached_property
+
+# Matches an opening <a ...> tag; group 1 is the raw attribute string.
+FIND_A_TAG = re.compile(r"<a(\b[^>]*)>")
+# Matches a self-closing <embed ... /> tag; group 1 is the raw attribute string.
+FIND_EMBED_TAG = re.compile(r"<embed(\b[^>]*)/>")
+# Matches a single name="value" attribute (double-quoted values only);
+# group 1 is the name and group 2 the still-escaped value.
+FIND_ATTRS = re.compile(r'([\w-]+)\="([^"]*)"')
+
+
+def extract_attrs(attr_string: str) -> dict:
+    """
+    Parse the attribute portion of a tag into a dict mapping attribute names to their
+    values, with the &lt; &gt; &quot; and &amp; entities decoded.
+    """
+    # &amp; has to be decoded last so that e.g. "&amp;lt;" ends up as "&lt;", not "<"
+    entity_table = (
+        ("&lt;", "<"),
+        ("&gt;", ">"),
+        ("&quot;", '"'),
+        ("&amp;", "&"),
+    )
+
+    parsed = {}
+    for attr_match in FIND_ATTRS.finditer(attr_string):
+        key, raw_value = attr_match.groups()
+        for entity, character in entity_table:
+            raw_value = raw_value.replace(entity, character)
+        parsed[key] = raw_value
+    return parsed
+
+
+class TagMatch:
+    """Wraps one regexp match of a tag found in a rich text string"""
+
+    def __init__(self, match):
+        # the underlying regexp match object
+        self.match = match
+        # replacement text; stays None until a rewriter fills it in
+        self.replacement = None
+
+    @cached_property
+    def attrs(self):
+        """The tag's attributes as a dict, parsed from the first regexp group."""
+        raw_attrs = self.match.group(1)
+        return extract_attrs(raw_attrs)
+
+    @property
+    def start(self):
+        """Index in the searched string where the tag begins."""
+        return self.match.span()[0]
+
+    @property
+    def end(self):
+        """Index in the searched string just past the end of the tag."""
+        return self.match.span()[1]
+
+
+class TagRewriter:
+    """
+    Base class for rewriters that find tags in an HTML-like string, group them by tag
+    type, and replace each one with text produced by a per-type rule.
+
+    Subclasses supply the regex that finds the tags, the logic that derives a tag type
+    from a tag's attributes, and the lookup that turns a tag type into replacements.
+    """
+
+    def __init__(self, rules=None, bulk_rules=None, reference_extractors=None):
+        # rules: tag type -> function taking one attribute dict
+        self.rules = rules or {}
+        # bulk_rules: tag type -> function taking a list of attribute dicts
+        self.bulk_rules = bulk_rules or {}
+        # reference_extractors: tag type -> function yielding references for one attribute dict
+        self.reference_extractors = reference_extractors or {}
+
+    def get_opening_tag_regex(self):
+        """Return the compiled regex that matches the tags this rewriter handles."""
+        raise NotImplementedError
+
+    def get_tag_type_from_attrs(self, attrs):
+        """Given a dict of attributes from a tag, return the tag type."""
+        raise NotImplementedError
+
+    def get_tag_replacements(self, tag_type, attrs_list):
+        """Given a list of attribute dicts, all taken from tags of the same type, return a
+        corresponding list of replacement strings to replace the tags with.
+
+        Return an empty list for cases when you don't want any replacements made.
+        """
+        raise NotImplementedError
+
+    def __call__(self, html: str) -> str:
+        """Return ``html`` with every matched tag swapped for its replacement.
+
+        Tags whose type yields no replacements are left untouched. When nothing is
+        replaced at all, the input object is returned unchanged.
+        """
+        matches_by_tag_type = self.extract_tags(html)
+        matches_to_replace = []
+
+        # For each tag type, get the list of replacement strings for all tags of that type
+        for tag_type, tag_matches in matches_by_tag_type.items():
+            attr_dicts = [match.attrs for match in tag_matches]
+            replacements = self.get_tag_replacements(tag_type, attr_dicts)
+
+            if not replacements:
+                continue
+
+            for match, replacement in zip(tag_matches, replacements):
+                match.replacement = replacement
+                matches_to_replace.append(match)
+
+        if not matches_to_replace:
+            return html
+
+        # Walk the matches in order of appearance and stitch the result together in a
+        # single pass: the untouched text before each tag, then the tag's replacement.
+        # All matches come from one regex scan, so they never overlap.
+        matches_to_replace.sort(key=lambda match: match.start)
+
+        pieces = []
+        cursor = 0
+        for match in matches_to_replace:
+            pieces.append(html[cursor : match.start])
+            pieces.append(match.replacement)
+            cursor = match.end
+        pieces.append(html[cursor:])
+
+        return "".join(pieces)
+
+    def extract_tags(self, html: str) -> Dict[str, List[TagMatch]]:
+        """Helper method to extract and group HTML tags and their attributes.
+
+        Returns a dict of TagMatch objects, mapping tag types to a list of all TagMatch objects of that tag type.
+        """
+        matches_by_tag_type = defaultdict(list)
+
+        # Regex used to match <tag ...> tags in the HTML.
+        re_pattern = self.get_opening_tag_regex()
+
+        for re_match in re_pattern.finditer(html):
+            tag_match = TagMatch(re_match)
+            tag_type = self.get_tag_type_from_attrs(tag_match.attrs)
+
+            matches_by_tag_type[tag_type].append(tag_match)
+
+        return matches_by_tag_type
+
+    def convert_rule_to_bulk_rule(self, rule: Callable) -> Callable:
+        """Wrap a single-item rule so that it accepts a list of attribute dicts."""
+
+        def bulk_rule(args):
+            return list(map(rule, args))
+
+        return bulk_rule
+
+    def extract_references(self, html):
+        """Yield the references reported by the registered extractor for each matched tag.
+
+        Tags whose type has no registered extractor are skipped.
+        """
+        re_pattern = self.get_opening_tag_regex()
+        # the tag regex has a single group, so findall() gives the raw attribute strings
+        for match in re_pattern.findall(html):
+            attrs = extract_attrs(match)
+            tag_type = self.get_tag_type_from_attrs(attrs)
+
+            if tag_type not in self.reference_extractors:
+                continue
+
+            yield from self.reference_extractors[tag_type](attrs)
+
+
+class EmbedRewriter(TagRewriter):
+    """
+    Replaces each <embed embedtype="foo" /> tag in rich text with the HTML
+    fragment produced by the rule registered for 'foo'. A plain rule receives
+    one attribute dict; a bulk rule receives a list of them.
+    """
+
+    def get_opening_tag_regex(self):
+        """Embeds are located with the self-closing <embed ... /> pattern."""
+        return FIND_EMBED_TAG
+
+    def get_tag_type_from_attrs(self, attrs):
+        """The tag type is the embedtype attribute, or None when it is absent."""
+        return attrs.get("embedtype")
+
+    def get_tag_replacements(self, tag_type, attrs_list):
+        """Prefer a bulk rule for the type, then a plain rule; otherwise blank the tags out."""
+        handler = self.bulk_rules.get(tag_type)
+
+        if not handler and tag_type in self.rules:
+            handler = self.convert_rule_to_bulk_rule(self.rules[tag_type])
+
+        if handler:
+            return handler(attrs_list)
+
+        # Silently drop any tags with an unrecognised or missing embedtype attribute.
+        return [""] * len(attrs_list)
+
+
+class LinkRewriter(TagRewriter):
+    """
+    Replaces each <a linktype="foo"> tag in rich text with the HTML fragment
+    produced by the rule registered for 'foo'. Only the opening tag is
+    rewritten; a plain rule receives one attribute dict, a bulk rule a list.
+    """
+
+    def get_opening_tag_regex(self):
+        """Links are located with the opening <a ...> pattern."""
+        return FIND_A_TAG
+
+    def get_tag_type_from_attrs(self, attrs):
+        """Use the explicit linktype if there is one, else infer a type from the href."""
+        if "linktype" in attrs:
+            return attrs["linktype"]
+
+        href = attrs.get("href", None)
+        if not href:
+            return None
+
+        # From href attribute we try to detect only the linktypes that we
+        # currently support (`external` & `email`, `page` has a default handler)
+        # from the link chooser.
+        inferred_types = (
+            (("http:", "https:"), "external"),
+            (("mailto:",), "email"),
+            (("#",), "anchor"),
+        )
+        for prefixes, linktype in inferred_types:
+            if href.startswith(prefixes):
+                return linktype
+        return None
+
+    def get_tag_replacements(self, tag_type, attrs_list):
+        """Prefer a bulk rule for the type, then a plain rule; see inline notes for the rest."""
+        if not tag_type:
+            # Links with no linktype at all, such as <a name="important-anchor">,
+            # must stay exactly as they are: an empty list means "replace nothing".
+            return []
+
+        handler = self.bulk_rules.get(tag_type)
+
+        if not handler:
+            if tag_type in self.rules:
+                handler = self.convert_rule_to_bulk_rule(self.rules[tag_type])
+            elif tag_type in ["email", "external", "anchor"]:
+                # These known link types are also left alone when no richtext
+                # rule is registered for them, for example
+                # <a href="https://wagtail.org">.
+                return []
+
+        if handler:
+            return handler(attrs_list)
+
+        # Replace unrecognised link types with an empty link.
+        return ["<a>"] * len(attrs_list)
+
+
+class MultiRuleRewriter:
+    """Chains several rewriters, feeding the output of each one into the next"""
+
+    def __init__(self, rewriters):
+        self.rewriters = rewriters
+
+    def __call__(self, html):
+        """Run the rewriters over ``html`` in their given order and return the result."""
+        result = html
+        for step in self.rewriters:
+            result = step(result)
+        return result
+
+    def extract_references(self, html):
+        """Yield the references found by each rewriter in turn, all on the same input."""
+        for step in self.rewriters:
+            for reference in step.extract_references(html):
+                yield reference