# HG changeset patch # User Paul Boddie # Date 1533394669 -7200 # Node ID bc150f6c2567f7c7be4f789b93a3b76eb0c311dd # Parent d234e6e97a5cf92026b4584354bd7cd6b2a74138 Introduced a heading identification phase, assigning identifiers to headings in documents so that they may be unambiguously referenced. Such identifiers are set on Heading nodes and are retrieved when serialising those nodes. Moved the identifier encoding operation into the HTML linker abstraction, applying this operation specifically to fragment identifiers within non-URL link targets. diff -r d234e6e97a5c -r bc150f6c2567 moinformat/links/html.py --- a/moinformat/links/html.py Fri Aug 03 22:34:55 2018 +0200 +++ b/moinformat/links/html.py Sat Aug 04 16:57:49 2018 +0200 @@ -20,15 +20,9 @@ """ from moinformat.links.common import Linker -from urllib import quote as _quote +from urllib import quote, quote_plus from urlparse import urlparse -def quote(s): - - "Quote URL path 's', preserving path separators and fragment indicators." - - return "#".join(map(_quote, s.split("#", 1))) - class HTMLLinker(Linker): "Translate Moin links into HTML links." @@ -89,7 +83,7 @@ # Top-level pages. top_level = self.get_top_level() - return quote("%s%s" % (top_level and "%s/" % top_level or "", target)), None + return self.quote("%s%s" % (top_level and "%s/" % top_level or "", target)), None def translate_qualified_link(self, target): @@ -125,26 +119,101 @@ "Return a translation of the given attachment 'target'." - return quote("%sattachments/%s/%s" % ( + return self.quote("%sattachments/%s/%s" % ( self.get_top_level(), self.pagename, target)) def translate_interwiki(self, url, target): "Return a translation of the given interwiki 'target'." - return "%s%s" % (self.normalise(url), quote(target)) + return "%s%s" % (self.normalise(url), self.quote(target)) def translate_relative(self, target): "Return a translation of the given relative 'target'." - return quote(target[len("../"):]) + return self.quote(target[len("../"):]) def translate_subpage(self, target): "Return a translation of the given subpage 'target'." - return quote(".%s" % target) + return self.quote(".%s" % target) + + # Path encoding. + + def quote(self, s): + + """ + Quote URL path 's', preserving path separators and fragment indicators, + encoding fragment identifiers. + """ + + parts = s.split("#", 1) + + if len(parts) > 1: + parts[1] = self.make_id(parts[1]) + + return "#".join(map(quote, parts)) + + # Identifier encoding. + + def make_id(self, s): + + "Make a suitable identifier for HTML element identification." + + # NOTE: This reproduces the Moin algorithm for compatibility. + # NOTE: There may well be improvements possible, possibly by replacing plus + # NOTE: with something less cumbersome, even though plus may be unusual in + # NOTE: things like headings, anyway. + + # The desired output is the following pattern: + + # [A-Za-z][-_:.A-Za-z0-9]* + + # The Python UTF-7 encoder preserves symbols and it encodes + as +- with an + # output range as follows (in addition to A-Za-z0-9): + + # -_:.%+ !"#$&\'()*,/;<=>?@[]^`{|} + + # The quote_plus function converts space to plus, preserves -_:. and encodes + # all other symbols (including original occurrences of plus and percent) and + # non-alphanumeric (ASCII) characters using percent encoding. + + # With colons preserved, the resulting output is in the following range + # (in addition to A-Za-z0-9): + + # -_:.%+ + + # Percent will only occur as an encoding prefix. Plus will only occur as a + # replacement for space. + + # Combining quote_plus and UTF-7 gives the following range (in addition to + # A-Za-z0-9): + + # -_:.%+ + + # Examples: + + # UTF-7 quote_plus replace percent and plus + # : -> : -> : -> : + # - -> - -> - -> - + # . -> . -> . -> . + # % -> % -> %25 -> .25 + # + -> +- -> %2B- -> .2B- + # _ -> _ -> _ -> _ + # space -> space -> + -> _ + + # See: RFC2152 - UTF-7 A Mail-Safe Transformation Format of Unicode + + quoted = quote_plus(s.encode("utf-7"), ":").replace("%", ".").replace("+", "_") + + # Ensure that the identifier starts with an alphabetical character. + + if not quoted[0].isalpha(): + return "A%s" % quoted + else: + return quoted linker = HTMLLinker diff -r d234e6e97a5c -r bc150f6c2567 moinformat/macros/toc.py --- a/moinformat/macros/toc.py Fri Aug 03 22:34:55 2018 +0200 +++ b/moinformat/macros/toc.py Sat Aug 04 16:57:49 2018 +0200 @@ -20,7 +20,6 @@ """ from moinformat.macros.common import Macro -from moinformat.serialisers.common import make_id from moinformat.tree.moin import Container, Heading, Link, List, ListItem, Text class TableOfContents(Macro): @@ -173,8 +172,7 @@ "Return nodes for an entry involving 'heading'." - target = make_id(heading.text_content()) - return [Link(heading.nodes[:], "#%s" % target), Text("\n")] + return [Link(heading.nodes[:], "#%s" % heading.identifier), Text("\n")] macro = TableOfContents diff -r d234e6e97a5c -r bc150f6c2567 moinformat/parsers/moin.py --- a/moinformat/parsers/moin.py Fri Aug 03 22:34:55 2018 +0200 +++ b/moinformat/parsers/moin.py Sat Aug 04 16:57:49 2018 +0200 @@ -70,6 +70,10 @@ self.macros = [] + # Record headings for identifier disambiguation. + + self.headings = [] + # Principal parser methods. def parse(self, s): @@ -96,6 +100,10 @@ else: self.parse_region_type(self.region) + # Assign heading identifiers. + + self.identify_headings() + return self.region @@ -119,6 +127,24 @@ macro = macro_cls(node, self.region) macro.evaluate() + # Heading disambiguation. + + def identify_headings(self): + + "Assign identifiers to headings based on their textual content." + + d = {} + + for heading in self.headings: + text = heading.text_content() + + if not d.has_key(text): + d[text] = 0 + heading.identifier = text + else: + d[text] += 1 + heading.identifier = "%s-%d" % (text, d[text]) + # Parser methods supporting different page features. @@ -238,6 +264,10 @@ self.add_node(region, heading) self.new_block(region) + # Record the heading for later processing. + + self.root.headings.append(heading) + def parse_heading_end(self, heading): "Handle the end of a heading." diff -r d234e6e97a5c -r bc150f6c2567 moinformat/serialisers/common.py --- a/moinformat/serialisers/common.py Fri Aug 03 22:34:55 2018 +0200 +++ b/moinformat/serialisers/common.py Sat Aug 04 16:57:49 2018 +0200 @@ -19,8 +19,6 @@ this program. If not, see . """ -from urllib import quote_plus - class Serialiser: "General serialisation support." @@ -100,61 +98,4 @@ return s.replace("&", "&").replace("<", "<").replace(">", ">") -def make_id(s): - - "Make a suitable identifier for XML element identification." - - # NOTE: This reproduces the Moin algorithm for compatibility. - # NOTE: There may well be improvements possible, possibly by replacing plus - # NOTE: with something less cumbersome, even though plus may be unusual in - # NOTE: things like headings, anyway. - - # The desired output is the following pattern: - - # [A-Za-z][-_:.A-Za-z0-9]* - - # The Python UTF-7 encoder preserves symbols and it encodes + as +- with an - # output range as follows (in addition to A-Za-z0-9): - - # -_:.%+ !"#$&\'()*,/;<=>?@[]^`{|} - - # The quote_plus function converts space to plus, preserves -_:. and encodes - # all other symbols (including original occurrences of plus and percent) and - # non-alphanumeric (ASCII) characters using percent encoding. - - # With colons preserved, the resulting output is in the following range - # (in addition to A-Za-z0-9): - - # -_:.%+ - - # Percent will only occur as an encoding prefix. Plus will only occur as a - # replacement for space. - - # Combining quote_plus and UTF-7 gives the following range (in addition to - # A-Za-z0-9): - - # -_:.%+ - - # Examples: - - # UTF-7 quote_plus replace percent and plus - # : -> : -> : -> : - # - -> - -> - -> - - # . -> . -> . -> . - # % -> % -> %25 -> .25 - # + -> +- -> %2B- -> .2B- - # _ -> _ -> _ -> _ - # space -> space -> + -> _ - - # See: RFC2152 - UTF-7 A Mail-Safe Transformation Format of Unicode - - quoted = quote_plus(s.encode("utf-7"), ":").replace("%", ".").replace("+", "_") - - # Ensure that the identifier starts with an alphabetical character. - - if not quoted[0].isalpha(): - return "A%s" % quoted - else: - return quoted - # vim: tabstop=4 expandtab shiftwidth=4 diff -r d234e6e97a5c -r bc150f6c2567 moinformat/serialisers/html/moin.py --- a/moinformat/serialisers/html/moin.py Fri Aug 03 22:34:55 2018 +0200 +++ b/moinformat/serialisers/html/moin.py Sat Aug 04 16:57:49 2018 +0200 @@ -19,8 +19,7 @@ this program. If not, see . """ -from moinformat.serialisers.common import escape_attr, escape_text, make_id, \ - Serialiser +from moinformat.serialisers.common import escape_attr, escape_text, Serialiser class HTMLSerialiser(Serialiser): @@ -89,8 +88,8 @@ def end_emphasis(self): self.out("") - def start_heading(self, level, extra, pad, text): - self.out("" % (level, escape_attr(make_id(text)))) + def start_heading(self, level, extra, pad, identifier): + self.out("" % (level, escape_attr(self.linker.make_id(identifier)))) def end_heading(self, level, pad, extra): self.out("" % level) @@ -249,7 +248,7 @@ self.out("") def anchor(self, target): - self.out("" % escape_attr(make_id(target))) + self.out("" % escape_attr(self.linker.make_id(target))) def break_(self): pass diff -r d234e6e97a5c -r bc150f6c2567 moinformat/serialisers/moin/moin.py --- a/moinformat/serialisers/moin/moin.py Fri Aug 03 22:34:55 2018 +0200 +++ b/moinformat/serialisers/moin/moin.py Sat Aug 04 16:57:49 2018 +0200 @@ -66,7 +66,7 @@ def end_emphasis(self): self.out("''") - def start_heading(self, level, extra, pad, text): + def start_heading(self, level, extra, pad, identifier): self.out(extra + "=" * level + pad) def end_heading(self, level, pad, extra): diff -r d234e6e97a5c -r bc150f6c2567 moinformat/tree/moin.py --- a/moinformat/tree/moin.py Fri Aug 03 22:34:55 2018 +0200 +++ b/moinformat/tree/moin.py Sat Aug 04 16:57:49 2018 +0200 @@ -309,25 +309,30 @@ "A heading." - def __init__(self, nodes, level, start_extra="", start_pad="", end_pad="", end_extra=""): + def __init__(self, nodes, level, start_extra="", start_pad="", end_pad="", end_extra="", + identifier=None): Container.__init__(self, nodes) self.level = level self.start_extra = start_extra self.start_pad = start_pad self.end_pad = end_pad self.end_extra = end_extra + self.identifier = identifier def __repr__(self): - return "Heading(%r, %d, %r, %r, %r, %r)" % ( - self.nodes, self.level, self.start_extra, self.start_pad, self.end_pad, self.end_extra) + return "Heading(%r, %d, %r, %r, %r, %r, %r)" % ( + self.nodes, self.level, self.start_extra, self.start_pad, + self.end_pad, self.end_extra, self.identifier) def prettyprint(self, indent=""): - l = ["%sHeading: level=%d start_extra=%r start_pad=%r end_pad=%r end_extra=%r" % ( - indent, self.level, self.start_extra, self.start_pad, self.end_pad, self.end_extra)] + l = ["%sHeading: level=%d start_extra=%r start_pad=%r end_pad=%r" + " end_extra=%r identifier=%r" % ( + indent, self.level, self.start_extra, self.start_pad, self.end_pad, + self.end_extra, self.identifier)] return self._prettyprint(l, indent) def to_string(self, out): - out.start_heading(self.level, self.start_extra, self.start_pad, self.text_content()) + out.start_heading(self.level, self.start_extra, self.start_pad, self.identifier) self._to_string(out) out.end_heading(self.level, self.end_pad, self.end_extra)