Edgewall Software

Ticket #108: html5_support.2.diff

File html5_support.2.diff, 12.9 KB (added by tbroyer, 17 years ago)

Revised patch (better handling of elements/attributes outside the HTML namespace in output) + adds an HTML5Template class

  • genshi/__init__.py

     
    2626    pass
    2727
    2828from genshi.core import *
    29 from genshi.input import ParseError, XML, HTML
     29from genshi.input import ParseError, XML, HTML, HTML5
  • genshi/input.py

     
    2525import htmlentitydefs
    2626from StringIO import StringIO
    2727
    28 from genshi.core import Attrs, QName, Stream, stripentities
     28from genshi.core import Attrs, Namespace, QName, Stream, stripentities
    2929from genshi.core import DOCTYPE, START, END, START_NS, END_NS, TEXT, \
    3030                        START_CDATA, END_CDATA, PI, COMMENT
    3131
    32 __all__ = ['ET', 'ParseError', 'XMLParser', 'XML', 'HTMLParser', 'HTML']
     32__all__ = ['ET', 'ParseError', 'XMLParser', 'XML', 'HTMLParser', 'HTML', 'HTML5Parser', 'HTML5']
    3333__docformat__ = 'restructuredtext en'
    3434
    3535def ET(element):
     
    426426    """
    427427    return Stream(list(HTMLParser(StringIO(text), encoding=encoding)))
    428428
     429class HTML5Parser(object):
     430    """Parser for HTML input based on `html5lib`.
     431   
     432    This class provides the same interface for generating stream events as
     433    `XMLParser`.
     434   
     435    The parsing is initiated by iterating over the parser object:
     436   
     437    >>> parser = HTML5Parser(StringIO('<UL compact><LI>Foo</UL>'))
     438    >>> for kind, data, pos in parser:
     439    ...     print kind, repr(data)
     440    START (QName(u'html'), Attrs())
     441    START (QName(u'head'), Attrs())
     442    END QName(u'head')
     443    START (QName(u'body'), Attrs())
     444    START (QName(u'ul'), Attrs([(QName(u'compact'), '')]))
     445    START (QName(u'li'), Attrs())
     446    TEXT u'Foo'
     447    END QName(u'li')
     448    END QName(u'ul')
     449    END QName(u'body')
     450    END QName(u'html')
     451    """
     452   
     453    html = Namespace('http://www.w3.org/1999/xhtml')
     454
     455    def __init__(self, source, filename=None, encoding=None, innerHTML=False):
     456        """Initialize the parser for the given HTML input.
     457       
     458        :param source: the HTML text as a file-like object
     459        :param filename: the name of the file, if known
     460        :param encoding: encoding of the file; ignored if the input is unicode
     461        :param innerHTML: are we parsing in innerHTML mode (innerHTML=True is not yet supported by html5lib)
     462        """
     463        self.source = source
     464        self.filename = filename
     465        self.encoding = encoding
     466        self.innerHTML = innerHTML
     467        import html5lib
     468        self.parser = html5lib.HTMLParser()
     469
     470    def parse(self):
     471        """Generator that parses the HTML source, yielding markup events.
     472       
     473        :return: a markup event stream
     474        """
     475        # TODO: Add some basic namespace support, e.g. convert known prefixes (py:, svg:, mathml:, smil:) to QNames
     476        document = self.parser.parse(self.source, encoding=self.encoding, innerHTML=self.innerHTML)
     477        return self._generate(document)
     478
     479    def __iter__(self):
     480        return iter(self.parse())
     481   
     482    def _generate(self, element):
     483        from html5lib.treebuilders.simpletree import Document, DocumentType, CommentNode, TextNode
     484
     485        pos = (self.filename, -1, -1)
     486
     487        if isinstance(element, Document):
     488            for child in element.childNodes:
     489                for kind, data, pos in self._generate(child):
     490                    yield kind, data, pos
     491
     492        elif isinstance(element, DocumentType):
     493            yield DOCTYPE, (element.name, None, None), pos
     494
     495        elif isinstance(element, CommentNode):
     496            yield COMMENT, element.data, pos
     497
     498        elif isinstance(element, TextNode):
     499            yield TEXT, element.value, pos
     500
     501        else: # Element
     502            tag_name = self.html[element.name]
     503            attrs = Attrs([(self.html[attr], value) for attr, value in element.attributes.iteritems()])
     504            yield START, (tag_name, attrs), pos
     505            for child in element.childNodes:
     506                for kind, data, pos in self._generate(child):
     507                    yield kind, data, pos
     508            yield END, tag_name, pos
     509
     510
     511def HTML5(text, encoding=None, strict=False, innerHTML=False):
     512    """Parse the given HTML source and return a markup stream.
     513   
     514    Unlike with `HTML5Parser`, the returned stream is reusable, meaning it can be
     515    iterated over multiple times:
     516   
     517    >>> html = HTML5('<body><h1>Foo</h1></body>')
     518    >>> print html
     519    <html xmlns="http://www.w3.org/1999/xhtml"><head/><body><h1>Foo</h1></body></html>
     520    >>> print html.select('body/h1')
     521    <h1 xmlns="http://www.w3.org/1999/xhtml">Foo</h1>
     522    >>> print html.select('body/h1/text()')
     523    Foo
     524   
     525    :param text: the HTML source
     526    :return: the parsed XML event stream
     527    """
     528    return Stream(list(HTML5Parser(StringIO(text), encoding=encoding)))
     529
    429530def _coalesce(stream):
    430531    """Coalesces adjacent TEXT events into a single event."""
    431532    textbuf = []
  • genshi/output.py

     
    2727                        START_CDATA, END_CDATA, PI, COMMENT, XML_NAMESPACE
    2828
    2929__all__ = ['DocType', 'XMLSerializer', 'XHTMLSerializer', 'HTMLSerializer',
    30            'TextSerializer']
     30           'TextSerializer', 'HTML5Serializer']
    3131__docformat__ = 'restructuredtext en'
    3232
    3333
     
    5353        'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd'
    5454    )
    5555    XHTML = XHTML_STRICT
     56   
     57    HTML5 = ('html', None, None)
    5658
    5759
    5860class XMLSerializer(object):
     
    321323                yield Markup('<?%s %s?>' % data)
    322324
    323325
     326class HTML5Serializer(object):
     327    _NOESCAPE_ELEMS = frozenset(['style', 'script', 'xmp', 'iframe', 'noembed',
     328                                  'noframes', 'noscript'])
     329
     330    _EMPTY_ELEMS = frozenset(['area', 'base', 'basefont', 'bgsound', 'br',
     331                              'col', 'embed', 'frame', 'hr', 'img', 'input',
     332                              'link', 'meta', 'param', 'spacer', 'wbr'])
     333
     334    def __init__(self, doctype=DocType.HTML5):
     335        self.preamble = []
     336        if doctype:
     337            self.preamble.append((DOCTYPE, doctype, (None, -1, -1)))
     338        self.filters = []
     339   
     340    def __call__(self, stream):
     341        empty_elems = self._EMPTY_ELEMS
     342        noescape_elems = self._NOESCAPE_ELEMS
     343        have_doctype = False
     344        noescape = None
     345        skip_content = None
     346        depth = 0
     347
     348        stream = chain(self.preamble, stream)
     349        for filter_ in self.filters:
     350            stream = filter_(stream)
     351        for kind, data, pos in stream:
     352
     353            if kind is START:
     354                if kind is START:
     355                    depth += 1
     356                tag, attrib = data
     357                if isinstance(tag, QName):
     358                    tag = tag.localname
     359                buf = ['<', tag]
     360                for attr, value in attrib:
     361                    if isinstance(attr, QName):
     362                        attr = attr.localname
     363                    buf += [' ', attr, '="', self.escape(value), '"']
     364                buf.append('>')
     365                yield Markup(u''.join(buf))
     366                if tag in noescape_elems:
     367                    noescape = depth
     368                if tag in empty_elems:
     369                    skip_content = depth
     370
     371            elif kind is END:
     372                if noescape == depth:
     373                    noescape = None
     374                if isinstance(data, QName):
     375                    data = data.localname
     376                yield Markup('</%s>' % data)
     377                if skip_content == depth:
     378                    # If we skip_content, we also don't generate the end tag
     379                    skip_content = None
     380                depth -= 1
     381
     382            elif kind is TEXT:
     383                if noescape:
     384                    yield data
     385                else:
     386                    yield self.escape(data)
     387
     388            elif kind is COMMENT:
     389                yield Markup('<!-%s-->' % data)
     390
     391            elif kind is DOCTYPE and not have_doctype:
     392                name, pubid, sysid = data
     393                buf = ['<!DOCTYPE %s']
     394                if pubid:
     395                    buf.append(' PUBLIC "%s"')
     396                elif sysid:
     397                    buf.append(' SYSTEM')
     398                if sysid:
     399                    buf.append(' "%s"')
     400                buf.append('>\n')
     401                yield Markup(u''.join(buf), *filter(None, data))
     402                have_doctype = True
     403
     404            elif kind is PI:
     405                # This is not valid HTML5 but looks like an SGML PI
     406                yield Markup('<?%s %s>' % data)
     407
     408    def escape(text):
     409        return unicode(text).replace('&', '&amp;') \
     410                             .replace('<', '&lt;') \
     411                             .replace('>', '&gt;') \
     412                             .replace('"', '&quot;')
     413    escape = staticmethod(escape)
     414
     415
    324416class TextSerializer(object):
    325417    """Produces plain text from an event stream.
    326418   
  • genshi/template/html5.py

     
     1# -*- coding: utf-8 -*-
     2#
     3# Copyright (C) 2006-2007 Edgewall Software
     4# All rights reserved.
     5#
     6# This software is licensed as described in the file COPYING, which
     7# you should have received as part of this distribution. The terms
     8# are also available at http://genshi.edgewall.org/wiki/License.
     9#
     10# This software consists of voluntary contributions made by many
     11# individuals. For the exact contribution history, see the revision
     12# history and logs, available at http://genshi.edgewall.org/log/.
     13
     14"""Markup templating engine."""
     15
     16from genshi.core import Attrs, QName, Namespace, Stream
     17from genshi.core import START, END
     18from genshi.input import HTML5Parser
     19from genshi.template.markup import MarkupTemplate
     20
     21class HTML5Template(MarkupTemplate):
     22    """Implementation of the template language for HTML-based templates.
     23   
     24    >>> tmpl = HTML5Template('''<ul>
     25    ...   <li py:for="item in items">${item}</li>
     26    ... </ul>''')
     27    >>> print tmpl.generate(items=[1, 2, 3])
     28    <html xmlns="http://www.w3.org/1999/xhtml"><head/><body><ul>
     29      <li>1</li><li>2</li><li>3</li>
     30    </ul></body></html>
     31    """
     32
     33    HTML_NAMESPACE = Namespace('http://www.w3.org/1999/xhtml')
     34
     35    def __init__(self, source, basedir=None, filename=None, loader=None,
     36                 encoding=None):
     37        MarkupTemplate.__init__(self, source, basedir=basedir, filename=filename,
     38                                loader=loader, encoding=encoding)
     39
     40    def _parse(self, source, encoding):
     41        if not isinstance(source, Stream):
     42            source = HTML5Parser(source, filename=self.filename,
     43                                 encoding=encoding)
     44            source = Stream(source)
     45        source = source | self._to_xml
     46        return MarkupTemplate._parse(self, source, encoding)
     47
     48    def _to_xml(self, stream):
     49        """Transforms elements or attributes from HTML namespace (or no namespace) \
     50        whose name starts with py: or pi_ into XML Template directives.
     51       
     52        Also transforms elements from HTML namespace (or no namespace) whose name
     53        starts with xi: or xi_ into XInclude elements.
     54       
     55        TODO: support includes using <link> or <script> with a special rel="" or
     56        type="" value.
     57        """
     58        for kind, data, pos in stream:
     59            if kind is START:
     60                tag, attrs = data
     61                tag = QName(tag)
     62                if not tag.namespace or tag in self.HTML_NAMESPACE:
     63                    if tag.localname.startswith('py:') or tag.localname.startswith('py_'):
     64                        tag = self.DIRECTIVE_NAMESPACE[tag.localname[3:]]
     65                    elif tag.localname.startswith('xi:') or tag.localname.startswith('xi_'):
     66                        tag = self.XINCLUDE_NAMESPACE[tag.localname[3:]]
     67                new_attrs = []
     68                for name, value in attrs:
     69                    name = QName(name)
     70                    if not name.namespace or name in self.HTML_NAMESPACE:
     71                        if name.localname.startswith('py:') or name.localname.startswith('py_'):
     72                            name = self.DIRECTIVE_NAMESPACE[name.localname[3:]]
     73                    new_attrs.append((name, value))
     74                yield kind, (tag, Attrs(new_attrs)), pos
     75            elif kind is END:
     76                tag = QName(data)
     77                if not tag.namespace or tag in self.HTML_NAMESPACE:
     78                    if tag.startswith('py:'):
     79                        tag = self.DIRECTIVE_NAMESPACE[tag[3:]]
     80                yield kind, tag, pos
     81            else:
     82                yield kind, data, pos