Initial commit

This commit is contained in:
2024-08-27 20:33:44 +02:00
commit 1f1832267d
14794 changed files with 1599592 additions and 0 deletions

View File

@@ -0,0 +1,183 @@
import re
from functools import lru_cache
from html import unescape
from typing import List
from django.core.validators import MaxLengthValidator
from django.db.models import Model
from django.template.loader import render_to_string
from django.utils.html import strip_tags
from django.utils.safestring import mark_safe
from wagtail.rich_text.feature_registry import FeatureRegistry
from wagtail.rich_text.rewriters import EmbedRewriter, LinkRewriter, MultiRuleRewriter
features = FeatureRegistry()


@lru_cache(maxsize=None)
def get_rewriter():
    """
    Return the rewriter used to process database-representation rich text.

    The rewriter is assembled from the utility classes in
    wagtail.rich_text.rewriters together with the link handlers and embed
    handlers registered on the module-level ``features`` registry. Because of
    the ``lru_cache`` decorator it is only built on the first call; later
    calls return the same object.
    """
    embed_handlers = features.get_embed_types()
    link_handlers = features.get_link_types()

    def bulk_rules_for(handlers):
        # Map each entity type name to the handler's bulk expansion method
        return {
            type_name: handler.expand_db_attributes_many
            for type_name, handler in handlers.items()
        }

    def reference_extractors_for(handlers):
        # Map each entity type name to the handler's reference extractor
        return {
            type_name: handler.extract_references
            for type_name, handler in handlers.items()
        }

    link_rewriter = LinkRewriter(
        bulk_rules=bulk_rules_for(link_handlers),
        reference_extractors=reference_extractors_for(link_handlers),
    )
    embed_rewriter = EmbedRewriter(
        bulk_rules=bulk_rules_for(embed_handlers),
        reference_extractors=reference_extractors_for(embed_handlers),
    )
    # Links are rewritten first, then embeds
    return MultiRuleRewriter([link_rewriter, embed_rewriter])
def expand_db_html(html):
    """
    Convert rich text from its database representation into proper HTML that
    can be used on front-end templates, by running it through the shared rewriter.
    """
    return get_rewriter()(html)
def extract_references_from_rich_text(html):
    """
    Generator yielding each reference that the shared rewriter extracts from
    the given database-representation rich text.
    """
    yield from get_rewriter().extract_references(html)
def get_text_for_indexing(richtext):
    """
    Return a plain text version of a rich text string, suitable for search indexing;
    like Django's strip_tags, but ensures that whitespace is left between block elements
    so that <p>hello</p><p>world</p> gives "hello world", not "helloworld".

    Tags are matched case-insensitively. HTML entities in the remaining text are
    un-escaped, and leading / trailing whitespace is stripped.
    """
    # insert space after </p>, </h1> - </h6>, </li> and </blockquote> tags
    richtext = re.sub(
        r"(</(p|h\d|li|blockquote)>)", r"\1 ", richtext, flags=re.IGNORECASE
    )
    # also insert space after <br> and <hr>; the self-closing slash is optional so
    # that both the <br /> and the plain <br> spellings separate the adjacent words
    richtext = re.sub(r"(<(br|hr)\s*/?>)", r"\1 ", richtext, flags=re.IGNORECASE)
    return unescape(strip_tags(richtext).strip())
class RichText:
    """
    Renderable wrapper around a rich text value.

    The original source code is kept on the ``source`` attribute, while
    rendering the object produces the front-end HTML.
    This is the native value of a wagtailcore.blocks.field_block.RichTextBlock.
    """

    def __init__(self, source):
        # Any falsy source (such as None) is stored as an empty string
        self.source = source if source else ""

    def __html__(self):
        # Expand the stored source, then wrap it using the shared richtext template
        context = {"html": expand_db_html(self.source)}
        return render_to_string("wagtailcore/shared/richtext.html", context)

    def __str__(self):
        rendered = self.__html__()
        return mark_safe(rendered)

    def __bool__(self):
        # Truthy only when there is some source content
        return bool(self.source)

    def __eq__(self, other):
        # Equal only to another RichText carrying the same source
        return isinstance(other, RichText) and self.source == other.source
class EntityHandler:
    """
    Defines how an 'entity' within saved rich text is rewritten at render time.

    An 'entity' is a placeholder tag in the saved rich text which has to be turned
    into real HTML when rendering. Typically (but not necessarily) the entity is a
    reference to a model, which is fetched so that its data can be output into the
    rich text content (so that potentially changeable data is not stored within the
    saved rich text).

    Currently Wagtail supports two kinds of entity: links (represented as
    <a linktype="...">...</a>) and embeds (represented as <embed embedtype="..." />).
    """

    @staticmethod
    def get_model():
        """
        If supported, returns the type of model able to be handled by this handler,
        e.g. Page. The base implementation raises NotImplementedError.
        """
        raise NotImplementedError

    @classmethod
    def get_instance(cls, attrs: dict) -> Model:
        """
        Return the instance of the handler's model whose id equals attrs["id"],
        looked up through the model's default manager.
        """
        handled_model = cls.get_model()
        return handled_model._default_manager.get(id=attrs["id"])

    @classmethod
    def get_many(cls, attrs_list: List[dict]) -> List[Model]:
        """
        Return one entry per attribute dict, in the same order, using a single
        ``in_bulk`` lookup. Entries for which no instance was found are None.
        """
        handled_model = cls.get_model()
        requested_ids = [attrs.get("id") for attrs in attrs_list]
        # Key the fetched instances by stringified id, so that the lookup below
        # matches whether an id arrives as a string or as the native key type
        found_by_str_id = {
            str(pk): instance
            for pk, instance in handled_model._default_manager.in_bulk(
                requested_ids
            ).items()
        }
        return [found_by_str_id.get(str(requested)) for requested in requested_ids]

    @staticmethod
    def expand_db_attributes(attrs: dict) -> str:
        """
        Given a dict of attributes from the entity tag stored in the database,
        returns the real HTML representation. Must be provided by subclasses;
        the base implementation raises NotImplementedError.
        """
        raise NotImplementedError

    @classmethod
    def expand_db_attributes_many(cls, attrs_list: List[dict]) -> List[str]:
        """
        Given a list of attribute dicts from a list of entity tags stored in
        the database, return the real HTML representation of each one, by
        calling expand_db_attributes on every dict in turn.
        """
        return [cls.expand_db_attributes(attrs) for attrs in attrs_list]

    @classmethod
    def extract_references(cls, attrs):
        """
        Produces a sequence of (content_type_id, object_id, model_path, content_path)
        tuples for the database objects referenced by this entity, as per
        wagtail.models.ReferenceIndex._extract_references_from_object.
        The base implementation reports no references.
        """
        return []
class LinkHandler(EntityHandler):
    """
    Base class for handlers of link entities (<a linktype="...">...</a>).
    Adds nothing to EntityHandler itself.
    """
class EmbedHandler(EntityHandler):
    """
    Base class for handlers of embed entities (<embed embedtype="..." />).
    Adds nothing to EntityHandler itself.
    """
class RichTextMaxLengthValidator(MaxLengthValidator):
    """
    A MaxLengthValidator variant for rich text: only the text itself counts
    towards the limit, not the HTML tags around it.
    Entities are un-escaped for consistency with the client-side character count.
    """

    def clean(self, x):
        # Drop the markup first, then count the un-escaped characters
        text_only = strip_tags(x)
        return len(unescape(text_only))

View File

@@ -0,0 +1,111 @@
from wagtail import hooks
class FeatureRegistry:
    """
    A central store of information about optional features that can be enabled in rich text
    editors by passing a ``features`` list to the RichTextField, such as how to
    whitelist / convert HTML tags, and how to enable the feature on various editors.

    This information may come from diverse sources - for example, wagtailimages might define
    an 'images' feature and a Draftail plugin for it, while a third-party module might
    define a TinyMCE plugin for the same feature. The information is therefore collected into
    this registry via the 'register_rich_text_features' hook.
    """

    def __init__(self):
        # Has the register_rich_text_features hook been run for this registry?
        self.has_scanned_for_features = False

        # a dict of dicts, one for each editor (draftail.js, TinyMCE etc); each dict is a mapping
        # of feature names to 'plugin' objects that define how to implement that feature
        # (e.g. paths to JS files to import). The API of that plugin object is not defined
        # here, and is specific to each editor.
        self.plugins_by_editor = {}

        # a list of feature names that will be applied on rich text areas that do not specify
        # an explicit `features` list.
        self.default_features = []

        # a mapping of linktype names to the handler objects registered for converting database
        # representations of links (e.g. <a linktype="page" id="123">) into front-end HTML;
        # entries are keyed by the handler's ``identifier``
        self.link_types = {}

        # a mapping of embedtype names to the handler objects registered for converting database
        # representations of embedded content
        # (e.g. <embed embedtype="image" id="123" format="left" alt="foo">) into front-end HTML;
        # entries are keyed by the handler's ``identifier``
        self.embed_types = {}

        # a dict of dicts, one for each converter backend (editorhtml, contentstate etc);
        # each dict is a mapping of feature names to 'rule' objects that define how to convert
        # that feature's elements between editor representation and database representation
        # (e.g. elements to whitelist, functions for transferring attributes).
        # The API of that rule object is not defined here, and is specific to each converter backend.
        self.converter_rules_by_converter = {}

    def get_default_features(self):
        """Return the list of default feature names, running the hook scan first if needed."""
        if not self.has_scanned_for_features:
            self._scan_for_features()
        return self.default_features

    def _scan_for_features(self):
        """Call every 'register_rich_text_features' hook with this registry, then mark it scanned."""
        for fn in hooks.get_hooks("register_rich_text_features"):
            fn(self)
        self.has_scanned_for_features = True

    def register_editor_plugin(self, editor_name, feature_name, plugin):
        """Store ``plugin`` as the implementation of ``feature_name`` for ``editor_name``."""
        plugins = self.plugins_by_editor.setdefault(editor_name, {})
        plugins[feature_name] = plugin

    def get_editor_plugin(self, editor_name, feature_name):
        """Return the plugin registered for this editor / feature pair, or None if there is none."""
        if not self.has_scanned_for_features:
            self._scan_for_features()
        return self.plugins_by_editor.get(editor_name, {}).get(feature_name)

    def register_link_type(self, handler):
        """Register a link handler under its ``identifier``."""
        self.link_types[handler.identifier] = handler

    def get_link_types(self):
        """Return the mapping of linktype names to handlers, running the hook scan first if needed."""
        if not self.has_scanned_for_features:
            self._scan_for_features()
        return self.link_types

    def register_embed_type(self, handler):
        """Register an embed handler under its ``identifier``."""
        self.embed_types[handler.identifier] = handler

    def get_embed_types(self):
        """Return the mapping of embedtype names to handlers, running the hook scan first if needed."""
        if not self.has_scanned_for_features:
            self._scan_for_features()
        return self.embed_types

    def register_converter_rule(self, converter_name, feature_name, rule):
        """Store ``rule`` as the conversion rule of ``feature_name`` for ``converter_name``."""
        rules = self.converter_rules_by_converter.setdefault(converter_name, {})
        rules[feature_name] = rule

    def get_converter_rule(self, converter_name, feature_name):
        """Return the rule registered for this converter / feature pair, or None if there is none."""
        if not self.has_scanned_for_features:
            self._scan_for_features()
        return self.converter_rules_by_converter.get(converter_name, {}).get(
            feature_name
        )

    @staticmethod
    def function_as_entity_handler(identifier, fn):
        """
        Supports legacy registering of entity handlers as functions.

        Returns a new class whose ``identifier`` is the given identifier and whose
        ``expand_db_attributes`` is ``fn`` (a function taking a dict of attributes).
        The class also provides ``expand_db_attributes_many`` and
        ``extract_references``, because wagtail.rich_text.get_rewriter reads both
        of these from every registered handler; without them, registering the
        returned class would fail with AttributeError when the rewriter is built.
        """

        def expand_db_attributes_many(attrs_list):
            # Bulk form of the legacy function: apply it to each attribute dict in turn
            return [fn(attrs) for attrs in attrs_list]

        def extract_references(attrs):
            # A bare function carries no information about referenced objects
            return []

        return type(
            "EntityHandlerRegisteredAsFunction",
            (object,),
            {
                "identifier": identifier,
                "expand_db_attributes": staticmethod(fn),
                "expand_db_attributes_many": staticmethod(expand_db_attributes_many),
                "extract_references": staticmethod(extract_references),
            },
        )

View File

@@ -0,0 +1,40 @@
from typing import List
from django.db.models import Model
from django.utils.html import escape
from wagtail.models import Page
from wagtail.rich_text import LinkHandler
class PageLinkHandler(LinkHandler):
    """
    Link handler for page links, registered under the identifier "page"
    (i.e. <a linktype="page" id="...">).
    """

    identifier = "page"

    @staticmethod
    def get_model():
        """Return the model handled by this handler: Page."""
        return Page

    @classmethod
    def get_many(cls, attrs_list: List[dict]) -> List[Model]:
        """
        Return one page (or None where no page matched) per attribute dict, in order.
        """
        # Override LinkHandler.get_many to reduce database queries through the
        # use of PageQuerySet.specific() instead of QuerySet.in_bulk().
        instance_ids = [attrs.get("id") for attrs in attrs_list]
        qs = Page.objects.filter(id__in=instance_ids).defer_streamfields().specific()
        # Key by stringified id so the lookup matches ids given as strings
        pages_by_str_id = {str(page.id): page for page in qs}
        return [pages_by_str_id.get(str(id_)) for id_ in instance_ids]

    @classmethod
    def expand_db_attributes(cls, attrs: dict) -> str:
        """Return the opening <a> tag for a single page link, via the bulk method."""
        return cls.expand_db_attributes_many([attrs])[0]

    @classmethod
    def expand_db_attributes_many(cls, attrs_list: List[dict]) -> List[str]:
        """
        Return the opening <a> tag for each attribute dict: an escaped href to the
        localized page's URL, or a bare "<a>" when the page was not found.
        """
        return [
            '<a href="%s">' % escape(page.localized.url) if page else "<a>"
            for page in cls.get_many(attrs_list)
        ]

    @classmethod
    def extract_references(cls, attrs):
        """
        Yield a (model, object_id, model_path, content_path) tuple for the page
        referenced by the link; the Page model class is given in the first slot.
        """
        # A page link without an id references nothing. Skip it rather than raise
        # KeyError, matching get_many, which also tolerates a missing id.
        if "id" not in attrs:
            return
        yield Page, attrs["id"], "", ""

View File

@@ -0,0 +1,242 @@
"""
Utility classes for rewriting elements of HTML-like strings
"""
import re
from collections import defaultdict
from typing import Callable, Dict, List
from django.utils.functional import cached_property
# Opening <a ...> tag; group 1 is the raw attribute string
FIND_A_TAG = re.compile(r"<a(\b[^>]*)>")
# Self-closing <embed ... /> tag; group 1 is the raw attribute string
FIND_EMBED_TAG = re.compile(r"<embed(\b[^>]*)/>")
# A single name="value" attribute (double-quoted values only)
FIND_ATTRS = re.compile(r'([\w-]+)\="([^"]*)"')


def extract_attrs(attr_string: str) -> dict:
    """
    Parse the attribute portion of a tag into a dict mapping attribute names
    to their un-escaped string values.
    """
    # "&amp;" is deliberately handled last, so that e.g. "&amp;lt;" ends up as
    # the literal text "&lt;" instead of being un-escaped twice into "<"
    entity_replacements = (
        ("&lt;", "<"),
        ("&gt;", ">"),
        ("&quot;", '"'),
        ("&amp;", "&"),
    )
    attributes = {}
    for found in FIND_ATTRS.finditer(attr_string):
        name, value = found.groups()
        for entity, character in entity_replacements:
            value = value.replace(entity, character)
        attributes[name] = value
    return attributes
class TagMatch:
    """Wraps the regexp match for a single tag found in a rich text string"""

    def __init__(self, match):
        # Replacement text for the tag; stays None until the rewriter fills it in
        self.replacement = None
        # The underlying regexp match object
        self.match = match

    @cached_property
    def attrs(self):
        # Group 1 of the match holds the tag's raw attribute string
        raw_attributes = self.match.group(1)
        return extract_attrs(raw_attributes)

    @property
    def start(self):
        # Offset at which the matched tag begins
        begin = self.match.start()
        return begin

    @property
    def end(self):
        # Offset just past the matched tag
        finish = self.match.end()
        return finish
class TagRewriter:
    """
    Base class for rewriters that replace tags of a particular kind within an
    HTML-like string.

    Subclasses supply the regex that finds the tags (get_opening_tag_regex), the way
    a tag's type is derived from its attributes (get_tag_type_from_attrs) and the
    replacement strategy (get_tag_replacements).
    """

    def __init__(self, rules=None, bulk_rules=None, reference_extractors=None):
        # Each argument is a mapping keyed by tag type; a missing / empty argument
        # becomes an empty dict
        self.rules = rules or {}
        self.bulk_rules = bulk_rules or {}
        self.reference_extractors = reference_extractors or {}

    def get_opening_tag_regex(self):
        """Return the compiled regex matching the tags handled by this rewriter."""
        raise NotImplementedError

    def get_tag_type_from_attrs(self, attrs):
        """Given a dict of attributes from a tag, return the tag type."""
        raise NotImplementedError

    def get_tag_replacements(self, tag_type, attrs_list):
        """Given a list of attribute dicts, all taken from tags of the same type, return a
        corresponding list of replacement strings to replace the tags with.

        Return an empty list for cases when you don't want any replacements made.
        """
        raise NotImplementedError

    def __call__(self, html: str) -> str:
        """
        Return ``html`` with every matched tag replaced by the string obtained from
        get_tag_replacements. Tag types for which no replacements are returned are
        left as they are; if nothing is replaced, ``html`` itself is returned.
        """
        matches_by_tag_type = self.extract_tags(html)
        matches_to_replace = []

        # For each tag type, get the list of replacement strings for all tags of that type
        for tag_type, tag_matches in matches_by_tag_type.items():
            attr_dicts = [match.attrs for match in tag_matches]
            replacements = self.get_tag_replacements(tag_type, attr_dicts)

            if not replacements:
                continue

            for match, replacement in zip(tag_matches, replacements):
                match.replacement = replacement
                matches_to_replace.append(match)

        if not matches_to_replace:
            return html

        # Walk the tags in order of appearance, collecting the untouched text between
        # them and their replacements, then join once at the end. This avoids
        # rebuilding the whole string for every replacement.
        matches_to_replace.sort(key=lambda match: match.start)
        pieces = []
        cursor = 0
        for match in matches_to_replace:
            pieces.append(html[cursor : match.start])
            pieces.append(match.replacement)
            cursor = match.end
        pieces.append(html[cursor:])

        return "".join(pieces)

    def extract_tags(self, html: str) -> Dict[str, List[TagMatch]]:
        """Helper method to extract and group HTML tags and their attributes.

        Returns a dict of TagMatch objects, mapping tag types to a list of all TagMatch objects of that tag type.
        """
        matches_by_tag_type = defaultdict(list)

        # Regex used to match <tag ...> tags in the HTML.
        re_pattern = self.get_opening_tag_regex()

        for re_match in re_pattern.finditer(html):
            tag_match = TagMatch(re_match)
            tag_type = self.get_tag_type_from_attrs(tag_match.attrs)
            matches_by_tag_type[tag_type].append(tag_match)

        return matches_by_tag_type

    def convert_rule_to_bulk_rule(self, rule: Callable) -> Callable:
        """
        Wrap a rule taking a single attribute dict into a bulk rule, which takes a
        list of attribute dicts and returns the list of the rule's results.
        """

        def bulk_rule(args):
            return list(map(rule, args))

        return bulk_rule

    def extract_references(self, html):
        """
        Generator yielding whatever the registered reference extractor yields for each
        matched tag in ``html``. Tags whose type has no extractor are skipped.
        """
        re_pattern = self.get_opening_tag_regex()
        # findall returns the captured attribute string of each matched tag
        for match in re_pattern.findall(html):
            attrs = extract_attrs(match)
            tag_type = self.get_tag_type_from_attrs(attrs)

            if tag_type not in self.reference_extractors:
                continue

            yield from self.reference_extractors[tag_type](attrs)
class EmbedRewriter(TagRewriter):
    """
    Rewrites <embed embedtype="foo" /> tags within rich text into the HTML
    fragment given by the embed rule for 'foo'. Each embed rule is a function
    that takes a dict of attributes and returns the HTML fragment.
    """

    def get_opening_tag_regex(self):
        return FIND_EMBED_TAG

    def get_tag_type_from_attrs(self, attrs):
        # None when the tag carries no embedtype attribute
        return attrs.get("embedtype")

    def get_tag_replacements(self, tag_type, attrs_list):
        # Prefer a bulk rule; otherwise fall back to wrapping the single-item rule
        bulk_rule = self.bulk_rules.get(tag_type)
        if not bulk_rule and tag_type in self.rules:
            bulk_rule = self.convert_rule_to_bulk_rule(self.rules[tag_type])

        if bulk_rule:
            return bulk_rule(attrs_list)

        # Silently drop any tags with an unrecognised or missing embedtype attribute.
        return [""] * len(attrs_list)
class LinkRewriter(TagRewriter):
    """
    Rewrites <a linktype="foo"> tags within rich text into the HTML fragment
    given by the rule for 'foo'. Each link rule is a function that takes a dict
    of attributes and returns the HTML fragment for the opening tag (only).
    """

    def get_opening_tag_regex(self):
        return FIND_A_TAG

    def get_tag_type_from_attrs(self, attrs):
        # An explicit linktype attribute always wins
        if "linktype" in attrs:
            return attrs["linktype"]

        href = attrs.get("href", None)
        if not href:
            return None

        # From href attribute we try to detect only the linktypes that we
        # currently support (`external` & `email`, `page` has a default handler)
        # from the link chooser.
        if href.startswith(("http:", "https:")):
            return "external"
        if href.startswith("mailto:"):
            return "email"
        if href.startswith("#"):
            return "anchor"
        return None

    def get_tag_replacements(self, tag_type, attrs_list):
        if not tag_type:
            # We want to leave links without a linktype attribute unchanged,
            # for example <a name="important-anchor">, so we return an empty
            # list here so that no tag replacements are made.
            return []

        # Prefer a bulk rule; otherwise fall back to wrapping the single-item rule
        bulk_rule = self.bulk_rules.get(tag_type)
        if not bulk_rule:
            if tag_type in self.rules:
                bulk_rule = self.convert_rule_to_bulk_rule(self.rules[tag_type])
            elif tag_type in ["email", "external", "anchor"]:
                # We also want to leave links with certain known linktype
                # attributes alone even if there are no richtext rules
                # registered for those types, for example
                # <a href="https://wagtail.org">, so we return an empty
                # list here so that no tag replacements are made.
                return []

        if bulk_rule:
            return bulk_rule(attrs_list)

        # Replace unrecognised link types with an empty link.
        return ["<a>"] * len(attrs_list)
class MultiRuleRewriter:
    """Rewrites HTML by passing it through a sequence of rewriter functions in turn"""

    def __init__(self, rewriters):
        self.rewriters = rewriters

    def __call__(self, html):
        # Each rewriter receives the output of the one before it
        result = html
        for apply_rewriter in self.rewriters:
            result = apply_rewriter(result)
        return result

    def extract_references(self, html):
        # Every rewriter sees the same, unmodified html
        for each_rewriter in self.rewriters:
            yield from each_rewriter.extract_references(html)