Ticket #108: html5_support.diff
| File html5_support.diff, 8.9 KB (added by tbroyer, 17 years ago) |
|---|
-
genshi/__init__.py
26 26 pass 27 27 28 28 from genshi.core import * 29 from genshi.input import ParseError, XML, HTML 29 from genshi.input import ParseError, XML, HTML, HTML5 -
genshi/input.py
25 25 import htmlentitydefs 26 26 from StringIO import StringIO 27 27 28 from genshi.core import Attrs, QName, Stream, stripentities 28 from genshi.core import Attrs, Namespace, QName, Stream, stripentities 29 29 from genshi.core import DOCTYPE, START, END, START_NS, END_NS, TEXT, \ 30 30 START_CDATA, END_CDATA, PI, COMMENT 31 31 32 __all__ = ['ET', 'ParseError', 'XMLParser', 'XML', 'HTMLParser', 'HTML'] 32 __all__ = ['ET', 'ParseError', 'XMLParser', 'XML', 'HTMLParser', 'HTML', 'HTML5Parser', 'HTML5'] 33 33 __docformat__ = 'restructuredtext en' 34 34 35 35 def ET(element): … … 426 426 """ 427 427 return Stream(list(HTMLParser(StringIO(text), encoding=encoding))) 428 428 429 class HTML5Parser(object): 430 """Parser for HTML input based on `html5lib`. 431 432 This class provides the same interface for generating stream events as 433 `XMLParser`. 434 435 The parsing is initiated by iterating over the parser object: 436 437 >>> parser = HTML5Parser(StringIO('<UL compact><LI>Foo</UL>')) 438 >>> for kind, data, pos in parser: 439 ... print kind, repr(data) 440 START (QName(u'html'), Attrs()) 441 START (QName(u'head'), Attrs()) 442 END QName(u'head') 443 START (QName(u'body'), Attrs()) 444 START (QName(u'ul'), Attrs([(QName(u'compact'), '')])) 445 START (QName(u'li'), Attrs()) 446 TEXT u'Foo' 447 END QName(u'li') 448 END QName(u'ul') 449 END QName(u'body') 450 END QName(u'html') 451 """ 452 453 html = Namespace('http://www.w3.org/1999/xhtml') 454 455 def __init__(self, source, filename=None, encoding=None, innerHTML=False): 456 """Initialize the parser for the given HTML input. 
457 458 :param source: the HTML text as a file-like object 459 :param filename: the name of the file, if known 460 :param encoding: encoding of the file; ignored if the input is unicode 461 :param innerHTML: are we parsing in innerHTML mode (innerHTML=True is not yet supported by html5lib) 462 """ 463 self.source = source 464 self.filename = filename 465 self.encoding = encoding 466 self.innerHTML = innerHTML 467 import html5lib 468 self.parser = html5lib.HTMLParser() 469 470 def parse(self): 471 """Generator that parses the HTML source, yielding markup events. 472 473 :return: a markup event stream 474 """ 475 # TODO: Add some basic namespace support, e.g. convert known prefixes (py:, svg:, mathml:, smil:) to QNames 476 document = self.parser.parse(self.source, encoding=self.encoding, innerHTML=self.innerHTML) 477 return self._generate(document) 478 479 def __iter__(self): 480 return iter(self.parse()) 481 482 def _generate(self, element): 483 from html5lib.treebuilders.simpletree import Document, DocumentType, CommentNode, TextNode 484 485 pos = (self.filename, -1, -1) 486 487 if isinstance(element, Document): 488 for child in element.childNodes: 489 for kind, data, pos in self._generate(child): 490 yield kind, data, pos 491 492 elif isinstance(element, DocumentType): 493 yield DOCTYPE, (element.name, None, None), pos 494 495 elif isinstance(element, CommentNode): 496 yield COMMENT, element.data, pos 497 498 elif isinstance(element, TextNode): 499 yield TEXT, element.value, pos 500 501 else: # Element 502 tag_name = self.html[element.name] 503 attrs = Attrs([(self.html[attr], value) for attr, value in element.attributes.iteritems()]) 504 yield START, (tag_name, attrs), pos 505 for child in element.childNodes: 506 for kind, data, pos in self._generate(child): 507 yield kind, data, pos 508 yield END, tag_name, pos 509 510 511 def HTML5(text, encoding=None, strict=False, innerHTML=False): 512 """Parse the given HTML source and return a markup stream. 
513 514 Unlike with `HTML5Parser`, the returned stream is reusable, meaning it can be 515 iterated over multiple times: 516 517 >>> html = HTML5('<body><h1>Foo</h1></body>') 518 >>> print html 519 <html xmlns="http://www.w3.org/1999/xhtml"><head/><body><h1>Foo</h1></body></html> 520 >>> print html.select('body/h1') 521 <h1 xmlns="http://www.w3.org/1999/xhtml">Foo</h1> 522 >>> print html.select('body/h1/text()') 523 Foo 524 525 :param text: the HTML source 526 :return: the parsed XML event stream 527 """ 528 return Stream(list(HTML5Parser(StringIO(text), encoding=encoding))) 529 429 530 def _coalesce(stream): 430 531 """Coalesces adjacent TEXT events into a single event.""" 431 532 textbuf = [] -
genshi/output.py
27 27 START_CDATA, END_CDATA, PI, COMMENT, XML_NAMESPACE 28 28 29 29 __all__ = ['DocType', 'XMLSerializer', 'XHTMLSerializer', 'HTMLSerializer', 30 'TextSerializer'] 30 'TextSerializer', 'HTML5Serializer'] 31 31 __docformat__ = 'restructuredtext en' 32 32 33 33 … … 53 53 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd' 54 54 ) 55 55 XHTML = XHTML_STRICT 56 57 HTML5 = ('html', None, None) 56 58 57 59 58 60 class XMLSerializer(object): … … 321 323 yield Markup('<?%s %s?>' % data) 322 324 323 325 326 class HTML5Serializer(object): 327 _NOESCAPE_ELEMS = frozenset(['style', 'script', 'xmp', 'iframe', 'noembed', 328 'noframes', 'noscript']) 329 330 _EMPTY_ELEMS = frozenset(['area', 'base', 'basefont', 'bgsound', 'br', 331 'col', 'embed', 'frame', 'hr', 'img', 'input', 332 'link', 'meta', 'param', 'spacer', 'wbr']) 333 334 def __init__(self, doctype=DocType.HTML5): 335 self.preamble = [] 336 if doctype: 337 self.preamble.append((DOCTYPE, doctype, (None, -1, -1))) 338 self.filters = [EmptyTagFilter(), NamespaceStripper('http://www.w3.org/1999/xhtml')] 339 340 def __call__(self, stream): 341 empty_elems = self._EMPTY_ELEMS 342 noescape_elems = self._NOESCAPE_ELEMS 343 have_doctype = False 344 noescape = None 345 skip_content = None 346 depth = 0 347 348 stream = chain(self.preamble, stream) 349 for filter_ in self.filters: 350 stream = filter_(stream) 351 for kind, data, pos in stream: 352 353 if kind is START or kind is EMPTY: 354 if kind is START: 355 depth += 1 356 tag, attrib = data 357 buf = ['<', tag.lower()] 358 for attr, value in attrib: 359 buf += [' ', attr.lower(), '="', self.escape(value), '"'] 360 buf.append('>') 361 if kind is EMPTY: 362 if tag not in empty_elems: 363 buf.append('</%s>' % tag) 364 yield Markup(u''.join(buf)) 365 if tag in noescape_elems: 366 noescape = depth 367 if tag in empty_elems: 368 skip_content = depth 369 370 elif kind is END: 371 yield Markup('</%s>' % data) 372 if noescape == depth: 373 noescape = None 374 if skip_content == 
depth: 375 skip_content = None 376 depth -= 1 377 378 elif kind is TEXT: 379 if noescape: 380 yield data 381 else: 382 yield self.escape(data) 383 384 elif kind is COMMENT: 385 yield Markup('<!--%s-->' % data) 386 387 elif kind is DOCTYPE and not have_doctype: 388 name, pubid, sysid = data 389 buf = ['<!DOCTYPE %s'] 390 if pubid: 391 buf.append(' PUBLIC "%s"') 392 elif sysid: 393 buf.append(' SYSTEM') 394 if sysid: 395 buf.append(' "%s"') 396 buf.append('>\n') 397 yield Markup(u''.join(buf), *filter(None, data)) 398 have_doctype = True 399 400 elif kind is PI: 401 # This is not valid HTML5 but looks like an SGML PI 402 yield Markup('<?%s %s>' % data) 403 404 def escape(text): 405 return unicode(text).replace('&', '&amp;') \ 406 .replace('<', '&lt;') \ 407 .replace('>', '&gt;') \ 408 .replace('"', '&quot;') 409 escape = staticmethod(escape) 410 411 324 412 class TextSerializer(object): 325 413 """Produces plain text from an event stream. 326 414
