1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
1.2 +++ b/moinformat/parsers/html.py Sat Jul 01 00:43:48 2023 +0200
1.3 @@ -0,0 +1,82 @@
1.4 +#!/usr/bin/env python
1.5 +
1.6 +"""
1.7 +HTML document fragment parser.
1.8 +
1.9 +Copyright (C) 2023 Paul Boddie <paul@boddie.org.uk>
1.10 +
1.11 +This program is free software; you can redistribute it and/or modify it under
1.12 +the terms of the GNU General Public License as published by the Free Software
1.13 +Foundation; either version 3 of the License, or (at your option) any later
1.14 +version.
1.15 +
1.16 +This program is distributed in the hope that it will be useful, but WITHOUT
1.17 +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
1.18 +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
1.19 +details.
1.20 +
1.21 +You should have received a copy of the GNU General Public License along with
1.22 +this program. If not, see <http://www.gnu.org/licenses/>.
1.23 +"""
1.24 +
1.25 +from moinformat.parsers.common import ParserBase
1.26 +from moinformat.tree.html import Element, Fragment
1.27 +from moinformat.utils.htmlparse import Parser
1.28 +
class HTMLParser(ParserBase):

    "An HTML document fragment parser."

    formats = ["html"]

    def __init__(self, metadata):

        """
        Initialise the parser with 'metadata', an object whose 'get' method is
        consulted for the "theme_name" setting when parsing.
        """

        # NOTE(review): the ParserBase initialiser is not invoked here; confirm
        # that the base class needs no further setup.

        self.metadata = metadata

    def parse(self, s):

        """
        Parse the HTML text in 's'.

        If the metadata provides a "theme_name", return a fragment containing
        only the children of the document's body element, or None if no body
        element is found. Otherwise, return the parsed document itself.
        """

        doc = Parser(s).parse()

        # Without a theme, the parsed document is returned unchanged.

        if not self.metadata.get("theme_name"):
            return doc

        # With a theme, only the body content is wanted since the theme is
        # expected to supply the surrounding document structure.

        body = self._find_body(doc)

        # Test against None explicitly so that a body element without any
        # children is still treated as found.

        if body is None:
            return None

        return Fragment(body.nodes)

    def _find_body(self, node):

        """
        Find the body element from 'node', searching depth-first and returning
        the first such element found or None otherwise.
        """

        # Only nodes with children are searched. Elements are presumably a
        # kind of fragment (confirm against moinformat.tree.html), so that
        # anything else can be neither a body element nor contain one.

        if not isinstance(node, Fragment):
            return None

        # Return the node if it is a body element.

        if isinstance(node, Element) and node.name == "body":
            return node

        # Otherwise, search the children in document order.

        for child in node.nodes:
            body = self._find_body(child)

            # An explicit test prevents an empty body element from being
            # skipped as if nothing had been found.

            if body is not None:
                return body

        return None
# Module-level alias for the parser class; presumably looked up under this
# generic name by the code that selects parsers by format - confirm against
# the caller.
parser = HTMLParser
1.84 +
1.85 +# vim: tabstop=4 expandtab shiftwidth=4