Edgewall Software

Changeset 1189 for trunk/genshi/input.py


Ignore:
Timestamp:
Dec 29, 2012, 2:02:20 PM (11 years ago)
Author:
hodgestar
Message:

Fix parsing of multi-byte characters that are split across the 4K read-buffer boundaries of HTML files (fixes #538).

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/genshi/input.py

    r1157 r1189  
    1717
    1818from itertools import chain
     19import codecs
    1920import htmlentitydefs as entities
    2021import HTMLParser as html
     
    318319        """
    319320        def _generate():
     321            if self.encoding:
     322                reader = codecs.getreader(self.encoding)
     323                source = reader(self.source)
     324            else:
     325                source = self.source
    320326            try:
    321327                bufsize = 4 * 1024 # 4K
     
    323329                while 1:
    324330                    while not done and len(self._queue) == 0:
    325                         data = self.source.read(bufsize)
     331                        data = source.read(bufsize)
    326332                        if not data: # end of data
    327333                            self.close()
     
    329335                        else:
    330336                            if not isinstance(data, unicode):
    331                                 # bytes
    332                                 if self.encoding:
    333                                     data = data.decode(self.encoding)
    334                                 else:
    335                                     raise UnicodeError("source returned bytes, but no encoding specified")
     337                                raise UnicodeError("source returned bytes, but no encoding specified")
    336338                            self.feed(data)
    337339                    for kind, data, pos in self._queue:
     
    433435    """
    434436    if isinstance(text, unicode):
    435         return Stream(list(HTMLParser(StringIO(text), encoding=encoding)))
     437        # If it's unicode text the encoding should be set to None.
     438        # The option to pass in an incorrect encoding is for ease
     439        # of writing doctests that work in both Python 2.x and 3.x.
     440        return Stream(list(HTMLParser(StringIO(text), encoding=None)))
    436441    return Stream(list(HTMLParser(BytesIO(text), encoding=encoding)))
    437442
Note: See TracChangeset for help on using the changeset viewer.