Edgewall Software

Changeset 27 for trunk/markup/input.py


Ignore:
Timestamp: Jun 28, 2006, 10:55:04 AM (17 years ago)
Author: cmlenz
Message:
  • Split out the XPath tests into a separate unittest-based file.
  • Added many more docstrings.
  • Cleaned up the implementation of the XML/HTML parsers a bit.
  • The HTML parser now correctly handles minimized attributes.
  • Added COPYING and README files.
File: 1 edited

Legend:

Unmodified
Added
Removed
  • trunk/markup/input.py

    r22 r27  
    1919import HTMLParser as html
    2020import htmlentitydefs
    21 import re
    2221from StringIO import StringIO
    2322
     
    3837class XMLParser(object):
    3938    """Generator-based XML parser based on roughly equivalent code in
    40     Kid/ElementTree."""
     39    Kid/ElementTree.
     40   
     41    The parsing is initiated by iterating over the parser object:
     42   
     43    >>> parser = XMLParser(StringIO('<root id="2"><child>Foo</child></root>'))
     44    >>> for kind, data, pos in parser:
     45    ...     print kind, data
     46    START (u'root', [(u'id', u'2')])
     47    START (u'child', [])
     48    TEXT Foo
     49    END child
     50    END root
     51    """
    4152
    4253    def __init__(self, source, filename=None):
     54        """Initialize the parser for the given XML text.
     55       
     56        @param source: the XML text as a file-like object
     57        @param filename: the name of the file, if appropriate
     58        """
    4359        self.source = source
    4460        self.filename = filename
     
    91107            raise ParseError(msg, self.filename, e.lineno, e.offset)
    92108
     109    def _enqueue(self, kind, data, pos=None):
     110        if pos is None:
     111            pos = self._getpos()
     112        self._queue.append((kind, data, pos))
     113
    93114    def _getpos_unknown(self):
    94115        return (self.filename or '<string>', -1, -1)
     
    99120
    100121    def _handle_start(self, tag, attrib):
    101         self._queue.append((Stream.START, (QName(tag), Attributes(attrib.items())),
    102                            self._getpos()))
     122        self._enqueue(Stream.START, (QName(tag), Attributes(attrib.items())))
    103123
    104124    def _handle_end(self, tag):
    105         self._queue.append((Stream.END, QName(tag), self._getpos()))
     125        self._enqueue(Stream.END, QName(tag))
    106126
    107127    def _handle_data(self, text):
    108         self._queue.append((Stream.TEXT, text, self._getpos()))
     128        self._enqueue(Stream.TEXT, text)
    109129
    110130    def _handle_prolog(self, version, encoding, standalone):
    111         self._queue.append((Stream.PROLOG, (version, encoding, standalone),
    112                            self._getpos()))
     131        self._enqueue(Stream.PROLOG, (version, encoding, standalone))
    113132
    114133    def _handle_doctype(self, name, sysid, pubid, has_internal_subset):
    115         self._queue.append((Stream.DOCTYPE, (name, pubid, sysid), self._getpos()))
     134        self._enqueue(Stream.DOCTYPE, (name, pubid, sysid))
    116135
    117136    def _handle_start_ns(self, prefix, uri):
    118         self._queue.append((Stream.START_NS, (prefix or '', uri), self._getpos()))
     137        self._enqueue(Stream.START_NS, (prefix or '', uri))
    119138
    120139    def _handle_end_ns(self, prefix):
    121         self._queue.append((Stream.END_NS, prefix or '', self._getpos()))
     140        self._enqueue(Stream.END_NS, prefix or '')
    122141
    123142    def _handle_pi(self, target, data):
    124         self._queue.append((Stream.PI, (target, data), self._getpos()))
     143        self._enqueue(Stream.PI, (target, data))
    125144
    126145    def _handle_comment(self, text):
    127         self._queue.append((Stream.COMMENT, text, self._getpos()))
     146        self._enqueue(Stream.COMMENT, text)
    128147
    129148    def _handle_other(self, text):
     
    132151            try:
    133152                text = unichr(htmlentitydefs.name2codepoint[text[1:-1]])
    134                 self._queue.append((Stream.TEXT, text, self._getpos()))
     153                self._enqueue(Stream.TEXT, text)
    135154            except KeyError:
    136155                lineno, offset = self._getpos()
     
    148167    This class provides the same interface for generating stream events as
    149168    `XMLParser`, and attempts to automatically balance tags.
     169   
     170    The parsing is initiated by iterating over the parser object:
     171   
     172    >>> parser = HTMLParser(StringIO('<UL compact><LI>Foo</UL>'))
     173    >>> for kind, data, pos in parser:
     174    ...     print kind, data
     175    START (u'ul', [(u'compact', u'compact')])
     176    START (u'li', [])
     177    TEXT Foo
     178    END li
     179    END ul
    150180    """
    151181
     
    188218            raise ParseError(msg, self.filename, e.lineno, e.offset)
    189219
     220    def _enqueue(self, kind, data, pos=None):
     221        if pos is None:
     222            pos = self._getpos()
     223        self._queue.append((kind, data, pos))
     224
    190225    def _getpos(self):
    191226        lineno, column = self.getpos()
     
    193228
    194229    def handle_starttag(self, tag, attrib):
    195         pos = self._getpos()
    196         self._queue.append((Stream.START, (QName(tag), Attributes(attrib)), pos))
     230        fixed_attrib = []
     231        for name, value in attrib: # Fixup minimized attributes
     232            if value is None:
     233                value = name
     234            fixed_attrib.append((name, unicode(value)))
     235
     236        self._enqueue(Stream.START, (QName(tag), Attributes(fixed_attrib)))
    197237        if tag in self._EMPTY_ELEMS:
    198             self._queue.append((Stream.END, QName(tag), pos))
     238            self._enqueue(Stream.END, QName(tag))
    199239        else:
    200240            self._open_tags.append(tag)
     
    202242    def handle_endtag(self, tag):
    203243        if tag not in self._EMPTY_ELEMS:
    204             pos = self._getpos()
    205244            while self._open_tags:
    206245                open_tag = self._open_tags.pop()
    207246                if open_tag.lower() == tag.lower():
    208247                    break
    209                 self._queue.append((Stream.END, QName(open_tag), pos))
    210             self._queue.append((Stream.END, QName(tag), pos))
     248                self._enqueue(Stream.END, QName(open_tag))
     249            self._enqueue(Stream.END, QName(tag))
    211250
    212251    def handle_data(self, text):
    213         self._queue.append((Stream.TEXT, text, self._getpos()))
     252        self._enqueue(Stream.TEXT, text)
    214253
    215254    def handle_charref(self, name):
    216         self._queue.append((Stream.TEXT, Markup('&#%s;' % name), self._getpos()))
     255        self._enqueue(Stream.TEXT, Markup('&#%s;' % name))
    217256
    218257    def handle_entityref(self, name):
    219         self._queue.append((Stream.TEXT, Markup('&%s;' % name), self._getpos()))
     258        self._enqueue(Stream.TEXT, Markup('&%s;' % name))
    220259
    221260    def handle_pi(self, data):
    222261        target, data = data.split(maxsplit=1)
    223262        data = data.rstrip('?')
    224         self._queue.append((Stream.PI, (target.strip(), data.strip()),
    225                            self._getpos()))
     263        self._enqueue(Stream.PI, (target.strip(), data.strip()))
    226264
    227265    def handle_comment(self, text):
    228         self._queue.append((Stream.COMMENT, text, self._getpos()))
     266        self._enqueue(Stream.COMMENT, text)
    229267
    230268
Note: See TracChangeset for help on using the changeset viewer.