| 1 | # -*- coding: utf-8 -*- |
|---|
| 2 | # |
|---|
| 3 | # Copyright (C) 2006 Edgewall Software |
|---|
| 4 | # All rights reserved. |
|---|
| 5 | # |
|---|
| 6 | # This software is licensed as described in the file COPYING, which |
|---|
| 7 | # you should have received as part of this distribution. The terms |
|---|
| 8 | # are also available at http://genshi.edgewall.org/wiki/License. |
|---|
| 9 | # |
|---|
| 10 | # This software consists of voluntary contributions made by many |
|---|
| 11 | # individuals. For the exact contribution history, see the revision |
|---|
| 12 | # history and logs, available at http://genshi.edgewall.org/log/. |
|---|
| 13 | |
|---|
| 14 | from itertools import chain |
|---|
| 15 | from xml.parsers import expat |
|---|
| 16 | try: |
|---|
| 17 | frozenset |
|---|
| 18 | except NameError: |
|---|
| 19 | from sets import ImmutableSet as frozenset |
|---|
| 20 | import HTMLParser as html |
|---|
| 21 | import htmlentitydefs |
|---|
| 22 | from StringIO import StringIO |
|---|
| 23 | |
|---|
| 24 | from genshi.core import Attrs, QName, Stream, stripentities |
|---|
| 25 | from genshi.core import DOCTYPE, START, END, START_NS, END_NS, TEXT, \ |
|---|
| 26 | START_CDATA, END_CDATA, PI, COMMENT |
|---|
| 27 | |
|---|
| 28 | __all__ = ['ParseError', 'XMLParser', 'XML', 'HTMLParser', 'HTML'] |
|---|
| 29 | |
|---|
| 30 | |
|---|
class ParseError(Exception):
    """Error signalling that the parsed input contains a fatal syntax error.

    Besides the message, the exception records where the problem was found:

     * `msg`: the error message
     * `filename`: name of the input being parsed (`'<string>'` by default)
     * `lineno`: line number of the error, or -1 if not known
     * `offset`: column offset of the error, or -1 if not known
    """

    def __init__(self, message, filename='<string>', lineno=-1, offset=-1):
        """Create the exception.

        @param message: the error message
        @param filename: the name of the input in which the error occurred
        @param lineno: the line number at which the error occurred
        @param offset: the column offset at which the error occurred
        """
        # Location details are plain attributes for callers to inspect.
        self.filename, self.lineno, self.offset = filename, lineno, offset
        self.msg = message
        # Called unbound rather than through super(): exceptions are not
        # new-style classes on every Python version this file supports.
        Exception.__init__(self, message)
|---|
| 41 | |
|---|
| 42 | |
|---|
| 43 | class XMLParser(object): |
|---|
| 44 | """Generator-based XML parser based on roughly equivalent code in |
|---|
| 45 | Kid/ElementTree. |
|---|
| 46 | |
|---|
| 47 | The parsing is initiated by iterating over the parser object: |
|---|
| 48 | |
|---|
| 49 | >>> parser = XMLParser(StringIO('<root id="2"><child>Foo</child></root>')) |
|---|
| 50 | >>> for kind, data, pos in parser: |
|---|
| 51 | ... print kind, data |
|---|
| 52 | START (u'root', [(u'id', u'2')]) |
|---|
| 53 | START (u'child', []) |
|---|
| 54 | TEXT Foo |
|---|
| 55 | END child |
|---|
| 56 | END root |
|---|
| 57 | """ |
|---|
| 58 | |
|---|
| 59 | _entitydefs = ['<!ENTITY %s "&#%d;">' % (name, value) for name, value in |
|---|
| 60 | htmlentitydefs.name2codepoint.items()] |
|---|
| 61 | _external_dtd = '\n'.join(_entitydefs) |
|---|
| 62 | |
|---|
| 63 | def __init__(self, source, filename=None): |
|---|
| 64 | """Initialize the parser for the given XML text. |
|---|
| 65 | |
|---|
| 66 | @param source: the XML text as a file-like object |
|---|
| 67 | @param filename: the name of the file, if appropriate |
|---|
| 68 | """ |
|---|
| 69 | self.source = source |
|---|
| 70 | self.filename = filename |
|---|
| 71 | |
|---|
| 72 | # Setup the Expat parser |
|---|
| 73 | parser = expat.ParserCreate('utf-8', '}') |
|---|
| 74 | parser.buffer_text = True |
|---|
| 75 | parser.returns_unicode = True |
|---|
| 76 | parser.ordered_attributes = True |
|---|
| 77 | |
|---|
| 78 | parser.StartElementHandler = self._handle_start |
|---|
| 79 | parser.EndElementHandler = self._handle_end |
|---|
| 80 | parser.CharacterDataHandler = self._handle_data |
|---|
| 81 | parser.StartDoctypeDeclHandler = self._handle_doctype |
|---|
| 82 | parser.StartNamespaceDeclHandler = self._handle_start_ns |
|---|
| 83 | parser.EndNamespaceDeclHandler = self._handle_end_ns |
|---|
| 84 | parser.StartCdataSectionHandler = self._handle_start_cdata |
|---|
| 85 | parser.EndCdataSectionHandler = self._handle_end_cdata |
|---|
| 86 | parser.ProcessingInstructionHandler = self._handle_pi |
|---|
| 87 | parser.CommentHandler = self._handle_comment |
|---|
| 88 | |
|---|
| 89 | # Tell Expat that we'll handle non-XML entities ourselves |
|---|
| 90 | # (in _handle_other) |
|---|
| 91 | parser.DefaultHandler = self._handle_other |
|---|
| 92 | parser.SetParamEntityParsing(expat.XML_PARAM_ENTITY_PARSING_ALWAYS) |
|---|
| 93 | parser.UseForeignDTD() |
|---|
| 94 | parser.ExternalEntityRefHandler = self._build_foreign |
|---|
| 95 | |
|---|
| 96 | # Location reporting is only support in Python >= 2.4 |
|---|
| 97 | if not hasattr(parser, 'CurrentLineNumber'): |
|---|
| 98 | self._getpos = self._getpos_unknown |
|---|
| 99 | |
|---|
| 100 | self.expat = parser |
|---|
| 101 | self._queue = [] |
|---|
| 102 | |
|---|
| 103 | def parse(self): |
|---|
| 104 | def _generate(): |
|---|
| 105 | try: |
|---|
| 106 | bufsize = 4 * 1024 # 4K |
|---|
| 107 | done = False |
|---|
| 108 | while 1: |
|---|
| 109 | while not done and len(self._queue) == 0: |
|---|
| 110 | data = self.source.read(bufsize) |
|---|
| 111 | if data == '': # end of data |
|---|
| 112 | if hasattr(self, 'expat'): |
|---|
| 113 | self.expat.Parse('', True) |
|---|
| 114 | del self.expat # get rid of circular references |
|---|
| 115 | done = True |
|---|
| 116 | else: |
|---|
| 117 | if isinstance(data, unicode): |
|---|
| 118 | data = data.encode('utf-8') |
|---|
| 119 | self.expat.Parse(data, False) |
|---|
| 120 | for event in self._queue: |
|---|
| 121 | yield event |
|---|
| 122 | self._queue = [] |
|---|
| 123 | if done: |
|---|
| 124 | break |
|---|
| 125 | except expat.ExpatError, e: |
|---|
| 126 | msg = str(e) |
|---|
| 127 | if self.filename: |
|---|
| 128 | msg += ', in ' + self.filename |
|---|
| 129 | raise ParseError(msg, self.filename, e.lineno, e.offset) |
|---|
| 130 | return Stream(_generate()).filter(_coalesce) |
|---|
| 131 | |
|---|
| 132 | def __iter__(self): |
|---|
| 133 | return iter(self.parse()) |
|---|
| 134 | |
|---|
| 135 | def _build_foreign(self, context, base, sysid, pubid): |
|---|
| 136 | parser = self.expat.ExternalEntityParserCreate(context) |
|---|
| 137 | parser.ParseFile(StringIO(self._external_dtd)) |
|---|
| 138 | return 1 |
|---|
| 139 | |
|---|
| 140 | def _enqueue(self, kind, data=None, pos=None): |
|---|
| 141 | if pos is None: |
|---|
| 142 | pos = self._getpos() |
|---|
| 143 | if kind is TEXT: |
|---|
| 144 | # Expat reports the *end* of the text event as current position. We |
|---|
| 145 | # try to fix that up here as much as possible. Unfortunately, the |
|---|
| 146 | # offset is only valid for single-line text. For multi-line text, |
|---|
| 147 | # it is apparently not possible to determine at what offset it |
|---|
| 148 | # started |
|---|
| 149 | if '\n' in data: |
|---|
| 150 | lines = data.splitlines() |
|---|
| 151 | lineno = pos[1] - len(lines) + 1 |
|---|
| 152 | offset = -1 |
|---|
| 153 | else: |
|---|
| 154 | lineno = pos[1] |
|---|
| 155 | offset = pos[2] - len(data) |
|---|
| 156 | pos = (pos[0], lineno, offset) |
|---|
| 157 | self._queue.append((kind, data, pos)) |
|---|
| 158 | |
|---|
| 159 | def _getpos_unknown(self): |
|---|
| 160 | return (self.filename, -1, -1) |
|---|
| 161 | |
|---|
| 162 | def _getpos(self): |
|---|
| 163 | return (self.filename, self.expat.CurrentLineNumber, |
|---|
| 164 | self.expat.CurrentColumnNumber) |
|---|
| 165 | |
|---|
| 166 | def _handle_start(self, tag, attrib): |
|---|
| 167 | self._enqueue(START, (QName(tag), Attrs(zip(*[iter(attrib)] * 2)))) |
|---|
| 168 | |
|---|
| 169 | def _handle_end(self, tag): |
|---|
| 170 | self._enqueue(END, QName(tag)) |
|---|
| 171 | |
|---|
| 172 | def _handle_data(self, text): |
|---|
| 173 | self._enqueue(TEXT, text) |
|---|
| 174 | |
|---|
| 175 | def _handle_doctype(self, name, sysid, pubid, has_internal_subset): |
|---|
| 176 | self._enqueue(DOCTYPE, (name, pubid, sysid)) |
|---|
| 177 | |
|---|
| 178 | def _handle_start_ns(self, prefix, uri): |
|---|
| 179 | self._enqueue(START_NS, (prefix or '', uri)) |
|---|
| 180 | |
|---|
| 181 | def _handle_end_ns(self, prefix): |
|---|
| 182 | self._enqueue(END_NS, prefix or '') |
|---|
| 183 | |
|---|
| 184 | def _handle_start_cdata(self): |
|---|
| 185 | self._enqueue(START_CDATA) |
|---|
| 186 | |
|---|
| 187 | def _handle_end_cdata(self): |
|---|
| 188 | self._enqueue(END_CDATA) |
|---|
| 189 | |
|---|
| 190 | def _handle_pi(self, target, data): |
|---|
| 191 | self._enqueue(PI, (target, data)) |
|---|
| 192 | |
|---|
| 193 | def _handle_comment(self, text): |
|---|
| 194 | self._enqueue(COMMENT, text) |
|---|
| 195 | |
|---|
| 196 | def _handle_other(self, text): |
|---|
| 197 | if text.startswith('&'): |
|---|
| 198 | # deal with undefined entities |
|---|
| 199 | try: |
|---|
| 200 | text = unichr(htmlentitydefs.name2codepoint[text[1:-1]]) |
|---|
| 201 | self._enqueue(TEXT, text) |
|---|
| 202 | except KeyError: |
|---|
| 203 | filename, lineno, offset = self._getpos() |
|---|
| 204 | error = expat.error('undefined entity "%s": line %d, column %d' |
|---|
| 205 | % (text, lineno, offset)) |
|---|
| 206 | error.code = expat.errors.XML_ERROR_UNDEFINED_ENTITY |
|---|
| 207 | error.lineno = lineno |
|---|
| 208 | error.offset = offset |
|---|
| 209 | raise error |
|---|
| 210 | |
|---|
| 211 | |
|---|
def XML(text):
    """Parse a string of XML and return the events as a `Stream`.

    The text is wrapped in a `StringIO`, run through an `XMLParser`, and all
    resulting events are collected into a list before the stream is created.

    @param text: the XML text to parse
    @return: a `Stream` over the list of parsed events
    """
    parser = XMLParser(StringIO(text))
    events = list(parser)
    return Stream(events)
|---|
| 214 | |
|---|
| 215 | |
|---|
| 216 | class HTMLParser(html.HTMLParser, object): |
|---|
| 217 | """Parser for HTML input based on the Python `HTMLParser` module. |
|---|
| 218 | |
|---|
| 219 | This class provides the same interface for generating stream events as |
|---|
| 220 | `XMLParser`, and attempts to automatically balance tags. |
|---|
| 221 | |
|---|
| 222 | The parsing is initiated by iterating over the parser object: |
|---|
| 223 | |
|---|
| 224 | >>> parser = HTMLParser(StringIO('<UL compact><LI>Foo</UL>')) |
|---|
| 225 | >>> for kind, data, pos in parser: |
|---|
| 226 | ... print kind, data |
|---|
| 227 | START (u'ul', [(u'compact', u'compact')]) |
|---|
| 228 | START (u'li', []) |
|---|
| 229 | TEXT Foo |
|---|
| 230 | END li |
|---|
| 231 | END ul |
|---|
| 232 | """ |
|---|
| 233 | |
|---|
| 234 | _EMPTY_ELEMS = frozenset(['area', 'base', 'basefont', 'br', 'col', 'frame', |
|---|
| 235 | 'hr', 'img', 'input', 'isindex', 'link', 'meta', |
|---|
| 236 | 'param']) |
|---|
| 237 | |
|---|
| 238 | def __init__(self, source, filename=None): |
|---|
| 239 | html.HTMLParser.__init__(self) |
|---|
| 240 | self.source = source |
|---|
| 241 | self.filename = filename |
|---|
| 242 | self._queue = [] |
|---|
| 243 | self._open_tags = [] |
|---|
| 244 | |
|---|
| 245 | def parse(self): |
|---|
| 246 | def _generate(): |
|---|
| 247 | try: |
|---|
| 248 | bufsize = 4 * 1024 # 4K |
|---|
| 249 | done = False |
|---|
| 250 | while 1: |
|---|
| 251 | while not done and len(self._queue) == 0: |
|---|
| 252 | data = self.source.read(bufsize) |
|---|
| 253 | if data == '': # end of data |
|---|
| 254 | self.close() |
|---|
| 255 | done = True |
|---|
| 256 | else: |
|---|
| 257 | self.feed(data) |
|---|
| 258 | for kind, data, pos in self._queue: |
|---|
| 259 | yield kind, data, pos |
|---|
| 260 | self._queue = [] |
|---|
| 261 | if done: |
|---|
| 262 | open_tags = self._open_tags |
|---|
| 263 | open_tags.reverse() |
|---|
| 264 | for tag in open_tags: |
|---|
| 265 | yield END, QName(tag), pos |
|---|
| 266 | break |
|---|
| 267 | except html.HTMLParseError, e: |
|---|
| 268 | msg = '%s: line %d, column %d' % (e.msg, e.lineno, e.offset) |
|---|
| 269 | if self.filename: |
|---|
| 270 | msg += ', in %s' % self.filename |
|---|
| 271 | raise ParseError(msg, self.filename, e.lineno, e.offset) |
|---|
| 272 | return Stream(_generate()).filter(_coalesce) |
|---|
| 273 | |
|---|
| 274 | def __iter__(self): |
|---|
| 275 | return iter(self.parse()) |
|---|
| 276 | |
|---|
| 277 | def _enqueue(self, kind, data, pos=None): |
|---|
| 278 | if pos is None: |
|---|
| 279 | pos = self._getpos() |
|---|
| 280 | self._queue.append((kind, data, pos)) |
|---|
| 281 | |
|---|
| 282 | def _getpos(self): |
|---|
| 283 | lineno, column = self.getpos() |
|---|
| 284 | return (self.filename, lineno, column) |
|---|
| 285 | |
|---|
| 286 | def handle_starttag(self, tag, attrib): |
|---|
| 287 | fixed_attrib = [] |
|---|
| 288 | for name, value in attrib: # Fixup minimized attributes |
|---|
| 289 | if value is None: |
|---|
| 290 | value = name |
|---|
| 291 | fixed_attrib.append((name, unicode(stripentities(value)))) |
|---|
| 292 | |
|---|
| 293 | self._enqueue(START, (QName(tag), Attrs(fixed_attrib))) |
|---|
| 294 | if tag in self._EMPTY_ELEMS: |
|---|
| 295 | self._enqueue(END, QName(tag)) |
|---|
| 296 | else: |
|---|
| 297 | self._open_tags.append(tag) |
|---|
| 298 | |
|---|
| 299 | def handle_endtag(self, tag): |
|---|
| 300 | if tag not in self._EMPTY_ELEMS: |
|---|
| 301 | while self._open_tags: |
|---|
| 302 | open_tag = self._open_tags.pop() |
|---|
| 303 | if open_tag.lower() == tag.lower(): |
|---|
| 304 | break |
|---|
| 305 | self._enqueue(END, QName(open_tag)) |
|---|
| 306 | self._enqueue(END, QName(tag)) |
|---|
| 307 | |
|---|
| 308 | def handle_data(self, text): |
|---|
| 309 | self._enqueue(TEXT, text) |
|---|
| 310 | |
|---|
| 311 | def handle_charref(self, name): |
|---|
| 312 | text = unichr(int(name)) |
|---|
| 313 | self._enqueue(TEXT, text) |
|---|
| 314 | |
|---|
| 315 | def handle_entityref(self, name): |
|---|
| 316 | try: |
|---|
| 317 | text = unichr(htmlentitydefs.name2codepoint[name]) |
|---|
| 318 | except KeyError: |
|---|
| 319 | text = '&%s;' % name |
|---|
| 320 | self._enqueue(TEXT, text) |
|---|
| 321 | |
|---|
| 322 | def handle_pi(self, data): |
|---|
| 323 | target, data = data.split(maxsplit=1) |
|---|
| 324 | data = data.rstrip('?') |
|---|
| 325 | self._enqueue(PI, (target.strip(), data.strip())) |
|---|
| 326 | |
|---|
| 327 | def handle_comment(self, text): |
|---|
| 328 | self._enqueue(COMMENT, text) |
|---|
| 329 | |
|---|
| 330 | |
|---|
def HTML(text):
    """Parse a string of HTML and return the events as a `Stream`.

    The text is wrapped in a `StringIO`, run through an `HTMLParser`, and all
    resulting events are collected into a list before the stream is created.

    @param text: the HTML text to parse
    @return: a `Stream` over the list of parsed events
    """
    parser = HTMLParser(StringIO(text))
    events = list(parser)
    return Stream(events)
|---|
| 333 | |
|---|
def _coalesce(stream):
    """Stream filter that merges runs of consecutive TEXT events.

    Each run is replaced by a single TEXT event holding the concatenated
    text, positioned at the first event of the run that has a position. All
    other events are passed through unchanged.

    @param stream: an iterable of `(kind, data, pos)` event tuples
    @return: a generator over the resulting events
    """
    pending = [] # text of the TEXT events seen since the last other event
    start = None # position of the current run of TEXT events
    # The trailing sentinel flushes text buffered at the end of the stream;
    # having no kind, it is not passed on itself.
    sentinel = (None, None, None)
    for event in chain(stream, [sentinel]):
        kind, data, pos = event
        if kind is TEXT:
            if start is None:
                start = pos
            pending.append(data)
            continue
        if pending:
            yield TEXT, u''.join(pending), start
            pending = []
            start = None
        if kind:
            yield kind, data, pos
|---|