Changeset 1157 for trunk/genshi/input.py
- Timestamp: Mar 18, 2011, 10:08:12 AM (13 years ago)
- Location: trunk
- Files: 2 edited
  - . (modified) (1 prop)
  - genshi/input.py (modified) (12 diffs)
Legend:
- Unmodified
- Added
- Removed
-
trunk
- Property svn:mergeinfo changed:
  /branches/experimental/py3k merged: 1140
trunk/genshi/input.py
r1082 r1157
  19   19    import htmlentitydefs as entities
  20   20    import HTMLParser as html
  21       - from StringIO import StringIO
  22   21    from xml.parsers import expat
  23   22
   …    …
  25   24    from genshi.core import START, END, XML_DECL, DOCTYPE, TEXT, START_NS, \
  26   25                            END_NS, START_CDATA, END_CDATA, PI, COMMENT
       26  + from genshi.compat import StringIO, BytesIO
  27   27
       28  +
  28   29    __all__ = ['ET', 'ParseError', 'XMLParser', 'XML', 'HTMLParser', 'HTML']
   …    …
  91   92        _entitydefs = ['<!ENTITY %s "&#%d;">' % (name, value) for name, value in
  92   93                       entities.name2codepoint.items()]
  93       -     _external_dtd = '\n'.join(_entitydefs)
       94  +     _external_dtd = u'\n'.join(_entitydefs).encode('utf-8')
  94   95
  95   96        def __init__(self, source, filename=None, encoding=None):
   …    …
 109  110            parser = expat.ParserCreate(encoding, '}')
 110  111            parser.buffer_text = True
 111       -         parser.returns_unicode = True
      112  +         # Python 3 does not have returns_unicode
      113  +         if hasattr(parser, 'returns_unicode'):
      114  +             parser.returns_unicode = True
 112  115            parser.ordered_attributes = True
 113  116
   …    …
 147  150                        while not done and len(self._queue) == 0:
 148  151                            data = self.source.read(bufsize)
 149       -                         if data == '': # end of data
      152  +                         if not data: # end of data
 150  153                                if hasattr(self, 'expat'):
 151  154                                    self.expat.Parse('', True)
   …    …
 171  174        def _build_foreign(self, context, base, sysid, pubid):
 172  175            parser = self.expat.ExternalEntityParserCreate(context)
 173       -         parser.ParseFile(StringIO(self._external_dtd))
      176  +         parser.ParseFile(BytesIO(self._external_dtd))
 174  177            return 1
 175  178
   …    …
 280  283        The parsing is initiated by iterating over the parser object:
 281  284
 282       -     >>> parser = HTMLParser(StringIO('<UL compact><LI>Foo</UL>'))
      285  +     >>> parser = HTMLParser(BytesIO(u'<UL compact><LI>Foo</UL>'.encode('utf-8')), encoding='utf-8')
 283  286        >>> for kind, data, pos in parser:
 284  287        ...     print('%s %s' % (kind, data))
   …    …
 294  297                                  'param'])
 295  298
 296       -     def __init__(self, source, filename=None, encoding='utf-8'):
      299  +     def __init__(self, source, filename=None, encoding=None):
 297  300            """Initialize the parser for the given HTML input.
 298  301
   …    …
 321  324                        while not done and len(self._queue) == 0:
 322  325                            data = self.source.read(bufsize)
 323       -                         if data == '': # end of data
      326  +                         if not data: # end of data
 324  327                                self.close()
 325  328                                done = True
 326  329                            else:
      330  +                             if not isinstance(data, unicode):
      331  +                                 # bytes
      332  +                                 if self.encoding:
      333  +                                     data = data.decode(self.encoding)
      334  +                                 else:
      335  +                                     raise UnicodeError("source returned bytes, but no encoding specified")
 327  336                                self.feed(data)
 328  337                        for kind, data, pos in self._queue:
   …    …
 404  413
 405  414
 406       - def HTML(text, encoding='utf-8'):
      415  + def HTML(text, encoding=None):
 407  416        """Parse the given HTML source and return a markup stream.
 408  417
   …    …
 410  419        iterated over multiple times:
 411  420
 412       -     >>> html = HTML('<body><h1>Foo</h1></body>')
      421  +     >>> html = HTML('<body><h1>Foo</h1></body>', encoding='utf-8')
 413  422        >>> print(html)
 414  423        <body><h1>Foo</h1></body>
   …    …
 423  432                            fails
 424  433        """
 425       -     return Stream(list(HTMLParser(StringIO(text), encoding=encoding)))
      434  +     if isinstance(text, unicode):
      435  +         return Stream(list(HTMLParser(StringIO(text), encoding=encoding)))
      436  +     return Stream(list(HTMLParser(BytesIO(text), encoding=encoding)))
 426  437
 427  438
Note: See TracChangeset for help on using the changeset viewer.
