# HG changeset patch
# User Paul Boddie <paul@boddie.org.uk>
# Date 1533394669 -7200
# Node ID bc150f6c2567f7c7be4f789b93a3b76eb0c311dd
# Parent  d234e6e97a5cf92026b4584354bd7cd6b2a74138
Introduced a heading identification phase, assigning identifiers to headings in
documents so that they may be unambiguously referenced. Such identifiers are set
on Heading nodes and are retrieved when serialising those nodes.
Moved the identifier encoding operation into the HTML linker abstraction,
applying this operation specifically to fragment identifiers within non-URL link
targets.

diff -r d234e6e97a5c -r bc150f6c2567 moinformat/links/html.py
--- a/moinformat/links/html.py	Fri Aug 03 22:34:55 2018 +0200
+++ b/moinformat/links/html.py	Sat Aug 04 16:57:49 2018 +0200
@@ -20,15 +20,9 @@
 """
 
 from moinformat.links.common import Linker
-from urllib import quote as _quote
+from urllib import quote, quote_plus
 from urlparse import urlparse
 
-def quote(s):
-
-    "Quote URL path 's', preserving path separators and fragment indicators."
-
-    return "#".join(map(_quote, s.split("#", 1)))
-
 class HTMLLinker(Linker):
 
     "Translate Moin links into HTML links."
@@ -89,7 +83,7 @@
         # Top-level pages.
 
         top_level = self.get_top_level()
-        return quote("%s%s" % (top_level and "%s/" % top_level or "", target)), None
+        return self.quote("%s%s" % (top_level and "%s/" % top_level or "", target)), None
 
     def translate_qualified_link(self, target):
 
@@ -125,26 +119,101 @@
 
         "Return a translation of the given attachment 'target'."
 
-        return quote("%sattachments/%s/%s" % (
+        return self.quote("%sattachments/%s/%s" % (
             self.get_top_level(), self.pagename, target))
 
     def translate_interwiki(self, url, target):
 
         "Return a translation of the given interwiki 'target'."
 
-        return "%s%s" % (self.normalise(url), quote(target))
+        return "%s%s" % (self.normalise(url), self.quote(target))
 
     def translate_relative(self, target):
 
         "Return a translation of the given relative 'target'."
 
-        return quote(target[len("../"):])
+        return self.quote(target[len("../"):])
 
     def translate_subpage(self, target):
 
         "Return a translation of the given subpage 'target'."
 
-        return quote(".%s" % target)
+        return self.quote(".%s" % target)
+
+    # Path encoding.
+
+    def quote(self, s):
+
+        """
+        Quote URL path 's', preserving path separators and fragment indicators,
+        encoding fragment identifiers.
+        """
+
+        parts = s.split("#", 1)
+
+        if len(parts) > 1:
+            parts[1] = self.make_id(parts[1])
+
+        return "#".join(map(quote, parts))
+
+    # Identifier encoding.
+
+    def make_id(self, s):
+
+        "Make a suitable identifier for HTML element identification."
+
+        # NOTE: This reproduces the Moin algorithm for compatibility.
+        # NOTE: There may well be improvements possible, possibly by replacing plus
+        # NOTE: with something less cumbersome, even though plus may be unusual in
+        # NOTE: things like headings, anyway.
+
+        # The desired output is the following pattern:
+
+        # [A-Za-z][-_:.A-Za-z0-9]*
+
+        # The Python UTF-7 encoder preserves symbols and it encodes + as +- with an
+        # output range as follows (in addition to A-Za-z0-9):
+
+        # -_:.%+ !"#$&\'()*,/;<=>?@[]^`{|}
+
+        # The quote_plus function converts space to plus, preserves -_:. and encodes
+        # all other symbols (including original occurrences of plus and percent) and
+        # non-alphanumeric (ASCII) characters using percent encoding.
+
+        # With colons preserved, the resulting output is in the following range
+        # (in addition to A-Za-z0-9):
+
+        # -_:.%+
+
+        # Percent will only occur as an encoding prefix. Plus will only occur as a
+        # replacement for space.
+
+        # Combining quote_plus and UTF-7 gives the following range (in addition to
+        # A-Za-z0-9):
+
+        # -_:.%+
+
+        # Examples:
+
+        #          UTF-7         quote_plus    replace percent and plus
+        # :     -> :          -> :          -> :
+        # -     -> -          -> -          -> -
+        # .     -> .          -> .          -> .
+        # %     -> %          -> %25        -> .25
+        # +     -> +-         -> %2B-       -> .2B-
+        # _     -> _          -> _          -> _
+        # space -> space      -> +          -> _
+
+        # See: RFC2152 - UTF-7 A Mail-Safe Transformation Format of Unicode
+
+        quoted = quote_plus(s.encode("utf-7"), ":").replace("%", ".").replace("+", "_")
+
+        # Ensure that the identifier starts with an alphabetical character.
+
+        if not quoted[0].isalpha():
+            return "A%s" % quoted
+        else:
+            return quoted
 
 linker = HTMLLinker
 
diff -r d234e6e97a5c -r bc150f6c2567 moinformat/macros/toc.py
--- a/moinformat/macros/toc.py	Fri Aug 03 22:34:55 2018 +0200
+++ b/moinformat/macros/toc.py	Sat Aug 04 16:57:49 2018 +0200
@@ -20,7 +20,6 @@
 """
 
 from moinformat.macros.common import Macro
-from moinformat.serialisers.common import make_id
 from moinformat.tree.moin import Container, Heading, Link, List, ListItem, Text
 
 class TableOfContents(Macro):
@@ -173,8 +172,7 @@
 
         "Return nodes for an entry involving 'heading'."
 
-        target = make_id(heading.text_content())
-        return [Link(heading.nodes[:], "#%s" % target), Text("\n")]
+        return [Link(heading.nodes[:], "#%s" % heading.identifier), Text("\n")]
 
 macro = TableOfContents
 
diff -r d234e6e97a5c -r bc150f6c2567 moinformat/parsers/moin.py
--- a/moinformat/parsers/moin.py	Fri Aug 03 22:34:55 2018 +0200
+++ b/moinformat/parsers/moin.py	Sat Aug 04 16:57:49 2018 +0200
@@ -70,6 +70,10 @@
 
         self.macros = []
 
+        # Record headings for identifier disambiguation.
+
+        self.headings = []
+
     # Principal parser methods.
 
     def parse(self, s):
@@ -96,6 +100,10 @@
         else:
             self.parse_region_type(self.region)
 
+        # Assign heading identifiers.
+
+        self.identify_headings()
+
         return self.region
 
 
@@ -119,6 +127,24 @@
             macro = macro_cls(node, self.region)
             macro.evaluate()
 
+    # Heading disambiguation.
+
+    def identify_headings(self):
+
+        "Assign identifiers to headings based on their textual content."
+
+        d = {}
+
+        for heading in self.headings:
+            text = heading.text_content()
+
+            if not d.has_key(text):
+                d[text] = 0
+                heading.identifier = text
+            else:
+                d[text] += 1
+                heading.identifier = "%s-%d" % (text, d[text])
+
 
 
     # Parser methods supporting different page features.
@@ -238,6 +264,10 @@
         self.add_node(region, heading)
         self.new_block(region)
 
+        # Record the heading for later processing.
+
+        self.root.headings.append(heading)
+
     def parse_heading_end(self, heading):
 
         "Handle the end of a heading."
diff -r d234e6e97a5c -r bc150f6c2567 moinformat/serialisers/common.py
--- a/moinformat/serialisers/common.py	Fri Aug 03 22:34:55 2018 +0200
+++ b/moinformat/serialisers/common.py	Sat Aug 04 16:57:49 2018 +0200
@@ -19,8 +19,6 @@
 this program.  If not, see <http://www.gnu.org/licenses/>.
 """
 
-from urllib import quote_plus
-
 class Serialiser:
 
     "General serialisation support."
@@ -100,61 +98,4 @@
 
     return s.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")
 
-def make_id(s):
-
-    "Make a suitable identifier for XML element identification."
-
-    # NOTE: This reproduces the Moin algorithm for compatibility.
-    # NOTE: There may well be improvements possible, possibly by replacing plus
-    # NOTE: with something less cumbersome, even though plus may be unusual in
-    # NOTE: things like headings, anyway.
-
-    # The desired output is the following pattern:
-
-    # [A-Za-z][-_:.A-Za-z0-9]*
-
-    # The Python UTF-7 encoder preserves symbols and it encodes + as +- with an
-    # output range as follows (in addition to A-Za-z0-9):
-
-    # -_:.%+ !"#$&\'()*,/;<=>?@[]^`{|}
-
-    # The quote_plus function converts space to plus, preserves -_:. and encodes
-    # all other symbols (including original occurrences of plus and percent) and
-    # non-alphanumeric (ASCII) characters using percent encoding.
-
-    # With colons preserved, the resulting output is in the following range
-    # (in addition to A-Za-z0-9):
-
-    # -_:.%+
-
-    # Percent will only occur as an encoding prefix. Plus will only occur as a
-    # replacement for space.
-
-    # Combining quote_plus and UTF-7 gives the following range (in addition to
-    # A-Za-z0-9):
-
-    # -_:.%+
-
-    # Examples:
-
-    #          UTF-7         quote_plus    replace percent and plus
-    # :     -> :          -> :          -> :
-    # -     -> -          -> -          -> -
-    # .     -> .          -> .          -> .
-    # %     -> %          -> %25        -> .25
-    # +     -> +-         -> %2B-       -> .2B-
-    # _     -> _          -> _          -> _
-    # space -> space      -> +          -> _
-
-    # See: RFC2152 - UTF-7 A Mail-Safe Transformation Format of Unicode
-
-    quoted = quote_plus(s.encode("utf-7"), ":").replace("%", ".").replace("+", "_")
-
-    # Ensure that the identifier starts with an alphabetical character.
-
-    if not quoted[0].isalpha():
-        return "A%s" % quoted
-    else:
-        return quoted
-
 # vim: tabstop=4 expandtab shiftwidth=4
diff -r d234e6e97a5c -r bc150f6c2567 moinformat/serialisers/html/moin.py
--- a/moinformat/serialisers/html/moin.py	Fri Aug 03 22:34:55 2018 +0200
+++ b/moinformat/serialisers/html/moin.py	Sat Aug 04 16:57:49 2018 +0200
@@ -19,8 +19,7 @@
 this program.  If not, see <http://www.gnu.org/licenses/>.
 """
 
-from moinformat.serialisers.common import escape_attr, escape_text, make_id, \
-                                          Serialiser
+from moinformat.serialisers.common import escape_attr, escape_text, Serialiser
 
 class HTMLSerialiser(Serialiser):
 
@@ -89,8 +88,8 @@
     def end_emphasis(self):
         self.out("</em>")
 
-    def start_heading(self, level, extra, pad, text):
-        self.out("<h%d id='%s'>" % (level, escape_attr(make_id(text))))
+    def start_heading(self, level, extra, pad, identifier):
+        self.out("<h%d id='%s'>" % (level, escape_attr(self.linker.make_id(identifier))))
 
     def end_heading(self, level, pad, extra):
         self.out("</h%d>" % level)
@@ -249,7 +248,7 @@
         self.out("</span>")
 
     def anchor(self, target):
-        self.out("<a name='%s' />" % escape_attr(make_id(target)))
+        self.out("<a name='%s' />" % escape_attr(self.linker.make_id(target)))
 
     def break_(self):
         pass
diff -r d234e6e97a5c -r bc150f6c2567 moinformat/serialisers/moin/moin.py
--- a/moinformat/serialisers/moin/moin.py	Fri Aug 03 22:34:55 2018 +0200
+++ b/moinformat/serialisers/moin/moin.py	Sat Aug 04 16:57:49 2018 +0200
@@ -66,7 +66,7 @@
     def end_emphasis(self):
         self.out("''")
 
-    def start_heading(self, level, extra, pad, text):
+    def start_heading(self, level, extra, pad, identifier):
         self.out(extra + "=" * level + pad)
 
     def end_heading(self, level, pad, extra):
diff -r d234e6e97a5c -r bc150f6c2567 moinformat/tree/moin.py
--- a/moinformat/tree/moin.py	Fri Aug 03 22:34:55 2018 +0200
+++ b/moinformat/tree/moin.py	Sat Aug 04 16:57:49 2018 +0200
@@ -309,25 +309,30 @@
 
     "A heading."
 
-    def __init__(self, nodes, level, start_extra="", start_pad="", end_pad="", end_extra=""):
+    def __init__(self, nodes, level, start_extra="", start_pad="", end_pad="", end_extra="",
+                 identifier=None):
         Container.__init__(self, nodes)
         self.level = level
         self.start_extra = start_extra
         self.start_pad = start_pad
         self.end_pad = end_pad
         self.end_extra = end_extra
+        self.identifier = identifier
 
     def __repr__(self):
-        return "Heading(%r, %d, %r, %r, %r, %r)" % (
-            self.nodes, self.level, self.start_extra, self.start_pad, self.end_pad, self.end_extra)
+        return "Heading(%r, %d, %r, %r, %r, %r, %r)" % (
+            self.nodes, self.level, self.start_extra, self.start_pad,
+            self.end_pad, self.end_extra, self.identifier)
 
     def prettyprint(self, indent=""):
-        l = ["%sHeading: level=%d start_extra=%r start_pad=%r end_pad=%r end_extra=%r" % (
-                indent, self.level, self.start_extra, self.start_pad, self.end_pad, self.end_extra)]
+        l = ["%sHeading: level=%d start_extra=%r start_pad=%r end_pad=%r"
+             " end_extra=%r identifier=%r" % (
+             indent, self.level, self.start_extra, self.start_pad, self.end_pad,
+             self.end_extra, self.identifier)]
         return self._prettyprint(l, indent)
 
     def to_string(self, out):
-        out.start_heading(self.level, self.start_extra, self.start_pad, self.text_content())
+        out.start_heading(self.level, self.start_extra, self.start_pad, self.identifier)
         self._to_string(out)
         out.end_heading(self.level, self.end_pad, self.end_extra)