Edgewall Software

Changeset 1157 for trunk/genshi/input.py


Ignore:
Timestamp:
Mar 18, 2011, 10:08:12 AM (13 years ago)
Author:
hodgestar
Message:

Merge r1140 from py3k:

Add support for Python 3 to core Genshi components (genshi.core, genshi.input and genshi.output):

  • default input and output encodings changed from UTF-8 to None (i.e. unicode strings)
  • Namespace and QName objects do not call stringrepr in repr in Python 3 since repr() returns a unicode string there.
  • track changes to the expat parser in Python 3 (it mostly accepts bytes instead of strings)
Location:
trunk
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • trunk

  • trunk/genshi/input.py

    r1082 r1157  
    1919import htmlentitydefs as entities
    2020import HTMLParser as html
    21 from StringIO import StringIO
    2221from xml.parsers import expat
    2322
     
    2524from genshi.core import START, END, XML_DECL, DOCTYPE, TEXT, START_NS, \
    2625                        END_NS, START_CDATA, END_CDATA, PI, COMMENT
     26from genshi.compat import StringIO, BytesIO
     27
    2728
    2829__all__ = ['ET', 'ParseError', 'XMLParser', 'XML', 'HTMLParser', 'HTML']
     
    9192    _entitydefs = ['<!ENTITY %s "&#%d;">' % (name, value) for name, value in
    9293                   entities.name2codepoint.items()]
    93     _external_dtd = '\n'.join(_entitydefs)
     94    _external_dtd = u'\n'.join(_entitydefs).encode('utf-8')
    9495
    9596    def __init__(self, source, filename=None, encoding=None):
     
    109110        parser = expat.ParserCreate(encoding, '}')
    110111        parser.buffer_text = True
    111         parser.returns_unicode = True
     112        # Python 3 does not have returns_unicode
     113        if hasattr(parser, 'returns_unicode'):
     114            parser.returns_unicode = True
    112115        parser.ordered_attributes = True
    113116
     
    147150                    while not done and len(self._queue) == 0:
    148151                        data = self.source.read(bufsize)
    149                         if data == '': # end of data
     152                        if not data: # end of data
    150153                            if hasattr(self, 'expat'):
    151154                                self.expat.Parse('', True)
     
    171174    def _build_foreign(self, context, base, sysid, pubid):
    172175        parser = self.expat.ExternalEntityParserCreate(context)
    173         parser.ParseFile(StringIO(self._external_dtd))
     176        parser.ParseFile(BytesIO(self._external_dtd))
    174177        return 1
    175178
     
    280283    The parsing is initiated by iterating over the parser object:
    281284   
    282     >>> parser = HTMLParser(StringIO('<UL compact><LI>Foo</UL>'))
     285    >>> parser = HTMLParser(BytesIO(u'<UL compact><LI>Foo</UL>'.encode('utf-8')), encoding='utf-8')
    283286    >>> for kind, data, pos in parser:
    284287    ...     print('%s %s' % (kind, data))
     
    294297                              'param'])
    295298
    296     def __init__(self, source, filename=None, encoding='utf-8'):
     299    def __init__(self, source, filename=None, encoding=None):
    297300        """Initialize the parser for the given HTML input.
    298301       
     
    321324                    while not done and len(self._queue) == 0:
    322325                        data = self.source.read(bufsize)
    323                         if data == '': # end of data
     326                        if not data: # end of data
    324327                            self.close()
    325328                            done = True
    326329                        else:
     330                            if not isinstance(data, unicode):
     331                                # bytes
     332                                if self.encoding:
     333                                    data = data.decode(self.encoding)
     334                                else:
     335                                    raise UnicodeError("source returned bytes, but no encoding specified")
    327336                            self.feed(data)
    328337                    for kind, data, pos in self._queue:
     
    404413
    405414
    406 def HTML(text, encoding='utf-8'):
     415def HTML(text, encoding=None):
    407416    """Parse the given HTML source and return a markup stream.
    408417   
     
    410419    iterated over multiple times:
    411420   
    412     >>> html = HTML('<body><h1>Foo</h1></body>')
     421    >>> html = HTML('<body><h1>Foo</h1></body>', encoding='utf-8')
    413422    >>> print(html)
    414423    <body><h1>Foo</h1></body>
     
    423432                        fails
    424433    """
    425     return Stream(list(HTMLParser(StringIO(text), encoding=encoding)))
     434    if isinstance(text, unicode):
     435        return Stream(list(HTMLParser(StringIO(text), encoding=encoding)))
     436    return Stream(list(HTMLParser(BytesIO(text), encoding=encoding)))
    426437
    427438
Note: See TracChangeset for help on using the changeset viewer.