Index: genshi/__init__.py
===================================================================
--- genshi/__init__.py	(révision 527)
+++ genshi/__init__.py	(copie de travail)
@@ -26,4 +26,4 @@
     pass
 
 from genshi.core import *
-from genshi.input import ParseError, XML, HTML
+from genshi.input import ParseError, XML, HTML, HTML5
Index: genshi/input.py
===================================================================
--- genshi/input.py	(révision 527)
+++ genshi/input.py	(copie de travail)
@@ -25,11 +25,11 @@
 import htmlentitydefs
 from StringIO import StringIO
 
-from genshi.core import Attrs, QName, Stream, stripentities
+from genshi.core import Attrs, Namespace, QName, Stream, stripentities
 from genshi.core import DOCTYPE, START, END, START_NS, END_NS, TEXT, \
                         START_CDATA, END_CDATA, PI, COMMENT
 
-__all__ = ['ET', 'ParseError', 'XMLParser', 'XML', 'HTMLParser', 'HTML']
+__all__ = ['ET', 'ParseError', 'XMLParser', 'XML', 'HTMLParser', 'HTML', 'HTML5Parser', 'HTML5']
 __docformat__ = 'restructuredtext en'
 
 def ET(element):
@@ -426,6 +426,131 @@
     """
     return Stream(list(HTMLParser(StringIO(text), encoding=encoding)))
 
+class HTML5Parser(object):
+    """Parser for HTML input based on `html5lib`.
+
+    This class provides the same interface for generating stream events as
+    `XMLParser`. The source is parsed into an `html5lib` tree, which is then
+    walked to produce the events.
+
+    The parsing is initiated by iterating over the parser object:
+
+    >>> parser = HTML5Parser(StringIO('<UL compact><LI>Foo</UL>'))
+    >>> for kind, data, pos in parser:
+    ...     print kind, repr(data)
+    START (QName(u'html'), Attrs())
+    START (QName(u'head'), Attrs())
+    END QName(u'head')
+    START (QName(u'body'), Attrs())
+    START (QName(u'ul'), Attrs([(QName(u'compact'), '')]))
+    START (QName(u'li'), Attrs())
+    TEXT u'Foo'
+    END QName(u'li')
+    END QName(u'ul')
+    END QName(u'body')
+    END QName(u'html')
+    """
+
+    # Element names are looked up in this namespace, so tags in the stream are
+    # XHTML-qualified.
+    # NOTE(review): the doctest above shows unqualified element names, which
+    # does not look consistent with that -- confirm the expected repr.
+    html = Namespace('http://www.w3.org/1999/xhtml')
+
+    def __init__(self, source, filename=None, encoding=None, innerHTML=False):
+        """Initialize the parser for the given HTML input.
+
+        :param source: the HTML text as a file-like object
+        :param filename: the name of the file, if known
+        :param encoding: encoding of the file; ignored if the input is unicode
+        :param innerHTML: whether to parse in innerHTML mode (``innerHTML=True``
+                          is not yet supported by html5lib)
+        """
+        self.source = source
+        self.filename = filename
+        self.encoding = encoding
+        self.innerHTML = innerHTML
+        # Imported lazily: html5lib is only required once a parser is created
+        import html5lib
+        self.parser = html5lib.HTMLParser()
+
+    def parse(self):
+        """Generator that parses the HTML source, yielding markup events.
+
+        :return: a markup event stream
+        """
+        # TODO: Add some basic namespace support, e.g. convert known prefixes
+        #       (py:, svg:, mathml:, smil:) to QNames
+        document = self.parser.parse(self.source, encoding=self.encoding,
+                                     innerHTML=self.innerHTML)
+        return self._generate(document)
+
+    def __iter__(self):
+        return iter(self.parse())
+
+    def _generate(self, element):
+        """Recursively convert an `html5lib` tree node into markup events.
+
+        :param element: a node of the tree built by the `html5lib` parser
+        :return: a generator of ``(kind, data, pos)`` tuples; since no position
+                 information is taken from the tree, the line and column in
+                 ``pos`` are always -1
+        """
+        from html5lib.treebuilders.simpletree import Document, DocumentType, \
+                                                     CommentNode, TextNode
+
+        pos = (self.filename, -1, -1)
+
+        if isinstance(element, Document):
+            for child in element.childNodes:
+                for event in self._generate(child):
+                    yield event
+
+        elif isinstance(element, DocumentType):
+            yield DOCTYPE, (element.name, None, None), pos
+
+        elif isinstance(element, CommentNode):
+            yield COMMENT, element.data, pos
+
+        elif isinstance(element, TextNode):
+            yield TEXT, element.value, pos
+
+        else: # Element
+            tag_name = self.html[element.name]
+            # Unprefixed attributes are not in any namespace, so they must not
+            # be qualified with the XHTML namespace the way element names are
+            attrs = Attrs([(QName(attr), value) for attr, value
+                           in element.attributes.iteritems()])
+            yield START, (tag_name, attrs), pos
+            for child in element.childNodes:
+                for event in self._generate(child):
+                    yield event
+            yield END, tag_name, pos
+
+
+def HTML5(text, encoding=None, strict=False, innerHTML=False):
+    """Parse the given HTML source and return a markup stream.
+
+    Unlike with `HTML5Parser`, the returned stream is reusable, meaning it can be
+    iterated over multiple times:
+
+    >>> html = HTML5('<body><h1>Foo</h1></body>')
+    >>> print html
+    <html xmlns="http://www.w3.org/1999/xhtml"><head/><body><h1>Foo</h1></body></html>
+    >>> print html.select('body/h1')
+    <h1 xmlns="http://www.w3.org/1999/xhtml">Foo</h1>
+    >>> print html.select('body/h1/text()')
+    Foo
+
+    :param text: the HTML source
+    :param encoding: encoding of the source; passed on to `HTML5Parser`
+    :param strict: currently unused
+    :param innerHTML: passed on to `HTML5Parser`
+    :return: the parsed XML event stream
+    """
+    return Stream(list(HTML5Parser(StringIO(text), encoding=encoding,
+                                   innerHTML=innerHTML)))
+
 def _coalesce(stream):
     """Coalesces adjacent TEXT events into a single event."""
     textbuf = []
Index: genshi/output.py
===================================================================
--- genshi/output.py	(révision 527)
+++ genshi/output.py	(copie de travail)
@@ -27,7 +27,7 @@
                         START_CDATA, END_CDATA, PI, COMMENT, XML_NAMESPACE
 
 __all__ = ['DocType', 'XMLSerializer', 'XHTMLSerializer', 'HTMLSerializer',
-           'TextSerializer']
+           'TextSerializer', 'HTML5Serializer']
 __docformat__ = 'restructuredtext en'
 
 
@@ -53,6 +53,8 @@
         'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd'
     )
     XHTML = XHTML_STRICT
+    
+    HTML5 = ('html', None, None)
 
 
 class XMLSerializer(object):
@@ -321,6 +323,126 @@
                 yield Markup('<?%s %s?>' % data)
 
 
+class HTML5Serializer(object):
+    """Produces HTML5 text from an event stream.
+
+    Tag and attribute names are written in lower case, elements listed in
+    `_EMPTY_ELEMS` are written without an end tag, and text inside elements
+    listed in `_NOESCAPE_ELEMS` is written without escaping.
+    """
+
+    _NOESCAPE_ELEMS = frozenset(['style', 'script', 'xmp', 'iframe', 'noembed',
+                                 'noframes', 'noscript'])
+
+    _EMPTY_ELEMS = frozenset(['area', 'base', 'basefont', 'bgsound', 'br',
+                              'col', 'embed', 'frame', 'hr', 'img', 'input',
+                              'link', 'meta', 'param', 'spacer', 'wbr'])
+
+    def __init__(self, doctype=DocType.HTML5):
+        """Initialize the HTML5 serializer.
+
+        :param doctype: a ``(name, pubid, sysid)`` tuple for the DOCTYPE
+                        declaration to put at the top of the output, or a false
+                        value to not add one
+        """
+        self.preamble = []
+        if doctype:
+            self.preamble.append((DOCTYPE, doctype, (None, -1, -1)))
+        self.filters = [EmptyTagFilter(),
+                        NamespaceStripper('http://www.w3.org/1999/xhtml')]
+
+    def __call__(self, stream):
+        """Serialize the given event stream.
+
+        :param stream: the markup event stream to serialize
+        :return: a generator yielding the output piece by piece
+        """
+        empty_elems = self._EMPTY_ELEMS
+        noescape_elems = self._NOESCAPE_ELEMS
+        have_doctype = False
+        noescape = None      # depth of the element whose text is not escaped
+        skip_content = None  # depth of the void element being skipped
+        depth = 0
+
+        stream = chain(self.preamble, stream)
+        for filter_ in self.filters:
+            stream = filter_(stream)
+        for kind, data, pos in stream:
+
+            if skip_content is not None:
+                # A void element can have neither content nor an end tag, so
+                # drop everything up to and including its END event
+                if kind is START:
+                    depth += 1
+                elif kind is END:
+                    if depth == skip_content:
+                        skip_content = None
+                    depth -= 1
+                continue
+
+            if kind is START or kind is EMPTY:
+                tag, attrib = data
+                # Normalize once so that the start tag, the end tag and the
+                # lookups in the element sets all agree on the name
+                tag = tag.lower()
+                buf = ['<', tag]
+                for attr, value in attrib:
+                    buf += [' ', attr.lower(), '="', self.escape(value), '"']
+                buf.append('>')
+                if kind is EMPTY:
+                    if tag not in empty_elems:
+                        buf.append('</%s>' % tag)
+                else:
+                    # Only a START event opens an element; an EMPTY event has
+                    # no matching END that would reset these markers again
+                    depth += 1
+                    if tag in empty_elems:
+                        skip_content = depth
+                    elif noescape is None and tag in noescape_elems:
+                        noescape = depth
+                yield Markup(u''.join(buf))
+
+            elif kind is END:
+                yield Markup('</%s>' % data.lower())
+                if noescape == depth:
+                    noescape = None
+                depth -= 1
+
+            elif kind is TEXT:
+                if noescape is not None:
+                    yield data
+                else:
+                    yield self.escape(data)
+
+            elif kind is COMMENT:
+                yield Markup('<!--%s-->' % data)
+
+            elif kind is DOCTYPE and not have_doctype:
+                name, pubid, sysid = data
+                buf = ['<!DOCTYPE %s']
+                if pubid:
+                    buf.append(' PUBLIC "%s"')
+                elif sysid:
+                    buf.append(' SYSTEM')
+                if sysid:
+                    buf.append(' "%s"')
+                buf.append('>\n')
+                yield Markup(u''.join(buf), *filter(None, data))
+                have_doctype = True
+
+            elif kind is PI:
+                # This is not valid HTML5 but looks like an SGML PI
+                yield Markup('<?%s %s>' % data)
+
+    def escape(text):
+        """Return the text with ``&``, ``<``, ``>`` and ``"`` escaped."""
+        return unicode(text).replace('&', '&amp;') \
+                            .replace('<', '&lt;') \
+                            .replace('>', '&gt;') \
+                            .replace('"', '&quot;')
+    escape = staticmethod(escape)
+
+
 class TextSerializer(object):
     """Produces plain text from an event stream.
     
