1 | # -*- coding: utf-8 -*- |
---|
2 | # |
---|
3 | # Copyright (C) 2006-2007 Edgewall Software |
---|
4 | # All rights reserved. |
---|
5 | # |
---|
6 | # This software is licensed as described in the file COPYING, which |
---|
7 | # you should have received as part of this distribution. The terms |
---|
8 | # are also available at http://genshi.edgewall.org/wiki/License. |
---|
9 | # |
---|
10 | # This software consists of voluntary contributions made by many |
---|
11 | # individuals. For the exact contribution history, see the revision |
---|
12 | # history and logs, available at http://genshi.edgewall.org/log/. |
---|
13 | |
---|
14 | """Support for constructing markup streams from files, strings, or other |
---|
15 | sources. |
---|
16 | """ |
---|
17 | |
---|
18 | from itertools import chain |
---|
19 | from xml.parsers import expat |
---|
20 | try: |
---|
21 | frozenset |
---|
22 | except NameError: |
---|
23 | from sets import ImmutableSet as frozenset |
---|
24 | import HTMLParser as html |
---|
25 | import htmlentitydefs |
---|
26 | from StringIO import StringIO |
---|
27 | |
---|
28 | from genshi.core import Attrs, QName, Stream, stripentities |
---|
29 | from genshi.core import START, END, XML_DECL, DOCTYPE, TEXT, START_NS, END_NS, \ |
---|
30 | START_CDATA, END_CDATA, PI, COMMENT |
---|
31 | |
---|
32 | __all__ = ['ET', 'ParseError', 'XMLParser', 'XML', 'HTMLParser', 'HTML'] |
---|
33 | __docformat__ = 'restructuredtext en' |
---|
34 | |
---|
def ET(element):
    """Convert a given ElementTree element to a markup stream.

    :param element: an ElementTree element
    :return: a markup stream yielding ``(kind, data, pos)`` events; the
             position is always ``(None, -1, -1)`` because ElementTree does
             not retain source locations
    """
    # ElementTree uses Clark notation ('{uri}local'); stripping the leading
    # brace leaves 'uri}local', which is the string format QName expects
    tag_name = QName(element.tag.lstrip('{'))
    attrs = Attrs([(QName(attr.lstrip('{')), value)
                   for attr, value in element.items()])

    yield START, (tag_name, attrs), (None, -1, -1)
    if element.text:
        yield TEXT, element.text, (None, -1, -1)
    # Iterate the element directly instead of calling the deprecated
    # getchildren() method; both yield the child elements in document order
    for child in element:
        for item in ET(child):
            yield item
    yield END, tag_name, (None, -1, -1)
    if element.tail:
        yield TEXT, element.tail, (None, -1, -1)
---|
54 | |
---|
55 | |
---|
class ParseError(Exception):
    """Raised when fatal syntax errors are found in the input being parsed."""

    def __init__(self, message, filename=None, lineno=-1, offset=-1):
        """Exception initializer.

        :param message: the error message from the parser
        :param filename: the path to the file that was parsed
        :param lineno: the number of the line on which the error was
                       encountered
        :param offset: the column number where the error was encountered
        """
        # Keep the bare parser message separate from the (possibly
        # filename-qualified) string the exception renders as
        self.msg = message
        if filename:
            message = message + ', in ' + filename
        Exception.__init__(self, message)
        if filename:
            self.filename = filename
        else:
            self.filename = '<string>'
        self.lineno = lineno
        self.offset = offset
---|
76 | |
---|
77 | |
---|
class XMLParser(object):
    """Generator-based XML parser based on roughly equivalent code in
    Kid/ElementTree.

    The parsing is initiated by iterating over the parser object:

    >>> parser = XMLParser(StringIO('<root id="2"><child>Foo</child></root>'))
    >>> for kind, data, pos in parser:
    ...     print kind, data
    START (QName(u'root'), Attrs([(QName(u'id'), u'2')]))
    START (QName(u'child'), Attrs())
    TEXT Foo
    END child
    END root
    """

    # A fake external DTD declaring every HTML character entity (&nbsp;,
    # &copy;, ...) so that input using them parses even though XML itself
    # only predefines five entities; fed to Expat in _build_foreign
    _entitydefs = ['<!ENTITY %s "&#%d;">' % (name, value) for name, value in
                   htmlentitydefs.name2codepoint.items()]
    _external_dtd = '\n'.join(_entitydefs)

    def __init__(self, source, filename=None, encoding=None):
        """Initialize the parser for the given XML input.

        :param source: the XML text as a file-like object
        :param filename: the name of the file, if appropriate
        :param encoding: the encoding of the file; if not specified, the
                         encoding is assumed to be ASCII, UTF-8, or UTF-16, or
                         whatever the encoding specified in the XML declaration
                         (if any)
        """
        self.source = source
        self.filename = filename

        # Setup the Expat parser; '}' as the namespace separator makes Expat
        # report qualified names as 'uri}localname', which is exactly the
        # string format QName expects
        parser = expat.ParserCreate(encoding, '}')
        parser.buffer_text = True
        parser.returns_unicode = True
        parser.ordered_attributes = True

        # Wire each Expat callback to the handler that queues the
        # corresponding markup event
        parser.StartElementHandler = self._handle_start
        parser.EndElementHandler = self._handle_end
        parser.CharacterDataHandler = self._handle_data
        parser.StartDoctypeDeclHandler = self._handle_doctype
        parser.StartNamespaceDeclHandler = self._handle_start_ns
        parser.EndNamespaceDeclHandler = self._handle_end_ns
        parser.StartCdataSectionHandler = self._handle_start_cdata
        parser.EndCdataSectionHandler = self._handle_end_cdata
        parser.ProcessingInstructionHandler = self._handle_pi
        parser.XmlDeclHandler = self._handle_xml_decl
        parser.CommentHandler = self._handle_comment

        # Tell Expat that we'll handle non-XML entities ourselves
        # (in _handle_other)
        parser.DefaultHandler = self._handle_other
        parser.SetParamEntityParsing(expat.XML_PARAM_ENTITY_PARSING_ALWAYS)
        parser.UseForeignDTD()
        parser.ExternalEntityRefHandler = self._build_foreign

        # Location reporting is only supported in Python >= 2.4
        if not hasattr(parser, 'CurrentLineNumber'):
            self._getpos = self._getpos_unknown

        self.expat = parser
        self._queue = []  # events queued by the handlers, drained by parse()

    def parse(self):
        """Generator that parses the XML source, yielding markup events.

        :return: a markup event stream
        :raises ParseError: if the XML text is not well formed
        """
        def _generate():
            try:
                bufsize = 4 * 1024 # 4K
                done = False
                while 1:
                    # Feed chunks to Expat until its handlers have queued at
                    # least one event, or the input is exhausted
                    while not done and len(self._queue) == 0:
                        data = self.source.read(bufsize)
                        if data == '': # end of data
                            if hasattr(self, 'expat'):
                                self.expat.Parse('', True)
                                del self.expat # get rid of circular references
                            done = True
                        else:
                            if isinstance(data, unicode):
                                data = data.encode('utf-8')
                            self.expat.Parse(data, False)
                    for event in self._queue:
                        yield event
                    self._queue = []
                    if done:
                        break
            except expat.ExpatError, e:
                msg = str(e)
                raise ParseError(msg, self.filename, e.lineno, e.offset)
        # Adjacent TEXT events (e.g. around entity references) are merged
        return Stream(_generate()).filter(_coalesce)

    def __iter__(self):
        return iter(self.parse())

    def _build_foreign(self, context, base, sysid, pubid):
        # External entity reference callback: rather than fetching the real
        # DTD, parse the fake one declaring the HTML character entities
        parser = self.expat.ExternalEntityParserCreate(context)
        parser.ParseFile(StringIO(self._external_dtd))
        return 1

    def _enqueue(self, kind, data=None, pos=None):
        # Append a (kind, data, pos) event to the queue, computing the
        # position from Expat's current location when not supplied
        if pos is None:
            pos = self._getpos()
        if kind is TEXT:
            # Expat reports the *end* of the text event as current position. We
            # try to fix that up here as much as possible. Unfortunately, the
            # offset is only valid for single-line text. For multi-line text,
            # it is apparently not possible to determine at what offset it
            # started
            if '\n' in data:
                lines = data.splitlines()
                lineno = pos[1] - len(lines) + 1
                offset = -1
            else:
                lineno = pos[1]
                offset = pos[2] - len(data)
            pos = (pos[0], lineno, offset)
        self._queue.append((kind, data, pos))

    def _getpos_unknown(self):
        # Fallback used when Expat lacks CurrentLineNumber (Python < 2.4)
        return (self.filename, -1, -1)

    def _getpos(self):
        return (self.filename, self.expat.CurrentLineNumber,
                self.expat.CurrentColumnNumber)

    def _handle_start(self, tag, attrib):
        # With ordered_attributes enabled, `attrib` is a flat list
        # [name1, value1, name2, value2, ...]; pair the items up
        attrs = Attrs([(QName(name), value) for name, value in
                       zip(*[iter(attrib)] * 2)])
        self._enqueue(START, (QName(tag), attrs))

    def _handle_end(self, tag):
        self._enqueue(END, QName(tag))

    def _handle_data(self, text):
        self._enqueue(TEXT, text)

    def _handle_xml_decl(self, version, encoding, standalone):
        self._enqueue(XML_DECL, (version, encoding, standalone))

    def _handle_doctype(self, name, sysid, pubid, has_internal_subset):
        # Note the reordering: Expat passes (sysid, pubid) but the DOCTYPE
        # event carries (name, pubid, sysid)
        self._enqueue(DOCTYPE, (name, pubid, sysid))

    def _handle_start_ns(self, prefix, uri):
        # A default namespace declaration has prefix None; normalize to ''
        self._enqueue(START_NS, (prefix or '', uri))

    def _handle_end_ns(self, prefix):
        self._enqueue(END_NS, prefix or '')

    def _handle_start_cdata(self):
        self._enqueue(START_CDATA)

    def _handle_end_cdata(self):
        self._enqueue(END_CDATA)

    def _handle_pi(self, target, data):
        self._enqueue(PI, (target, data))

    def _handle_comment(self, text):
        self._enqueue(COMMENT, text)

    def _handle_other(self, text):
        # Default handler for markup Expat did not recognize; the case of
        # interest is unresolved entity references such as '&nbsp;'
        if text.startswith('&'):
            # deal with undefined entities
            try:
                text = unichr(htmlentitydefs.name2codepoint[text[1:-1]])
                self._enqueue(TEXT, text)
            except KeyError:
                filename, lineno, offset = self._getpos()
                error = expat.error('undefined entity "%s": line %d, column %d'
                                    % (text, lineno, offset))
                error.code = expat.errors.XML_ERROR_UNDEFINED_ENTITY
                error.lineno = lineno
                error.offset = offset
                raise error
---|
258 | |
---|
259 | |
---|
def XML(text):
    """Parse the given XML source and return a markup stream.

    Unlike with `XMLParser`, the returned stream is reusable, meaning it can be
    iterated over multiple times:

    >>> xml = XML('<doc><elem>Foo</elem><elem>Bar</elem></doc>')
    >>> print xml
    <doc><elem>Foo</elem><elem>Bar</elem></doc>
    >>> print xml.select('elem')
    <elem>Foo</elem><elem>Bar</elem>
    >>> print xml.select('elem/text()')
    FooBar

    :param text: the XML source
    :return: the parsed XML event stream
    :raises ParseError: if the XML text is not well-formed
    """
    # Materializing the event list (instead of keeping the lazy generator)
    # is what makes the returned stream safe to iterate repeatedly
    parser = XMLParser(StringIO(text))
    return Stream(list(parser))
---|
279 | |
---|
280 | |
---|
class HTMLParser(html.HTMLParser, object):
    """Parser for HTML input based on the Python `HTMLParser` module.

    This class provides the same interface for generating stream events as
    `XMLParser`, and attempts to automatically balance tags.

    The parsing is initiated by iterating over the parser object:

    >>> parser = HTMLParser(StringIO('<UL compact><LI>Foo</UL>'))
    >>> for kind, data, pos in parser:
    ...     print kind, data
    START (QName(u'ul'), Attrs([(QName(u'compact'), u'compact')]))
    START (QName(u'li'), Attrs())
    TEXT Foo
    END li
    END ul
    """

    # Elements that never have content: they get an END event immediately
    # after their START and are never pushed onto the open-tag stack
    _EMPTY_ELEMS = frozenset(['area', 'base', 'basefont', 'br', 'col', 'frame',
                              'hr', 'img', 'input', 'isindex', 'link', 'meta',
                              'param'])

    def __init__(self, source, filename=None, encoding='utf-8'):
        """Initialize the parser for the given HTML input.

        :param source: the HTML text as a file-like object
        :param filename: the name of the file, if known
        :param encoding: encoding of the file; ignored if the input is unicode
        """
        html.HTMLParser.__init__(self)
        self.source = source
        self.filename = filename
        self.encoding = encoding
        self._queue = []      # events queued by the handlers, drained by parse()
        self._open_tags = []  # stack of start tags awaiting their end tag

    def parse(self):
        """Generator that parses the HTML source, yielding markup events.

        :return: a markup event stream
        :raises ParseError: if the HTML text is not well formed
        """
        def _generate():
            try:
                bufsize = 4 * 1024 # 4K
                done = False
                while 1:
                    # Feed chunks to the underlying parser until its handlers
                    # have queued at least one event, or the input runs out
                    while not done and len(self._queue) == 0:
                        data = self.source.read(bufsize)
                        if data == '': # end of data
                            self.close()
                            done = True
                        else:
                            self.feed(data)
                    for kind, data, pos in self._queue:
                        yield kind, data, pos
                    self._queue = []
                    if done:
                        # Emit END events for any tags left open, innermost
                        # first, to balance the output stream.
                        # NOTE(review): `pos` here is left over from the last
                        # queued event; if the input produced no events at all
                        # this would be unbound -- confirm
                        open_tags = self._open_tags
                        open_tags.reverse()
                        for tag in open_tags:
                            yield END, QName(tag), pos
                        break
            except html.HTMLParseError, e:
                msg = '%s: line %d, column %d' % (e.msg, e.lineno, e.offset)
                raise ParseError(msg, self.filename, e.lineno, e.offset)
        # Adjacent TEXT events (e.g. around entity references) are merged
        return Stream(_generate()).filter(_coalesce)

    def __iter__(self):
        return iter(self.parse())

    def _enqueue(self, kind, data, pos=None):
        # Append an event, filling in the current parse position by default
        if pos is None:
            pos = self._getpos()
        self._queue.append((kind, data, pos))

    def _getpos(self):
        lineno, column = self.getpos()
        return (self.filename, lineno, column)

    def handle_starttag(self, tag, attrib):
        fixed_attrib = []
        for name, value in attrib: # Fixup minimized attributes
            if value is None:
                # Minimized attribute (e.g. <ul compact>): use the name as
                # the value, per the usual SGML expansion
                value = unicode(name)
            elif not isinstance(value, unicode):
                value = value.decode(self.encoding, 'replace')
            fixed_attrib.append((QName(name), stripentities(value)))

        self._enqueue(START, (QName(tag), Attrs(fixed_attrib)))
        if tag in self._EMPTY_ELEMS:
            self._enqueue(END, QName(tag))
        else:
            self._open_tags.append(tag)

    def handle_endtag(self, tag):
        if tag not in self._EMPTY_ELEMS:
            # Pop (and close) open tags until the matching start tag is
            # found; this balances improperly nested or unclosed markup
            while self._open_tags:
                open_tag = self._open_tags.pop()
                self._enqueue(END, QName(open_tag))
                if open_tag.lower() == tag.lower():
                    break

    def handle_data(self, text):
        if not isinstance(text, unicode):
            text = text.decode(self.encoding, 'replace')
        self._enqueue(TEXT, text)

    def handle_charref(self, name):
        # Numeric character reference: decimal (&#233;) or hex (&#xE9;)
        if name.lower().startswith('x'):
            text = unichr(int(name[1:], 16))
        else:
            text = unichr(int(name))
        self._enqueue(TEXT, text)

    def handle_entityref(self, name):
        # Named entity reference; unknown names are passed through verbatim
        try:
            text = unichr(htmlentitydefs.name2codepoint[name])
        except KeyError:
            text = '&%s;' % name
        self._enqueue(TEXT, text)

    def handle_pi(self, data):
        # Split the raw PI content into target and data, dropping the
        # trailing '?' of XML-style PIs (<?foo bar?>).
        # NOTE(review): assumes whitespace follows the target -- a bare
        # <?target?> would make split(None, 1) raise ValueError; confirm
        target, data = data.split(None, 1)
        if data.endswith('?'):
            data = data[:-1]
        self._enqueue(PI, (target.strip(), data.strip()))

    def handle_comment(self, text):
        self._enqueue(COMMENT, text)
---|
411 | |
---|
412 | |
---|
def HTML(text, encoding='utf-8'):
    """Parse the given HTML source and return a markup stream.

    Unlike with `HTMLParser`, the returned stream is reusable, meaning it can be
    iterated over multiple times:

    >>> html = HTML('<body><h1>Foo</h1></body>')
    >>> print html
    <body><h1>Foo</h1></body>
    >>> print html.select('h1')
    <h1>Foo</h1>
    >>> print html.select('h1/text()')
    Foo

    :param text: the HTML source
    :return: the parsed XML event stream
    :raises ParseError: if the HTML text is not well-formed, and error recovery
                        fails
    """
    # Materializing the event list (instead of keeping the lazy generator)
    # is what makes the returned stream safe to iterate repeatedly
    parser = HTMLParser(StringIO(text), encoding=encoding)
    return Stream(list(parser))
---|
433 | |
---|
def _coalesce(stream):
    """Coalesces adjacent TEXT events into a single event.

    A ``(None, None, None)`` sentinel is chained onto the stream so that a
    trailing run of text is flushed before the generator finishes.
    """
    pending = []
    pending_pos = None
    for kind, data, pos in chain(stream, [(None, None, None)]):
        if kind is TEXT:
            # Accumulate the text; remember where the run started
            if pending_pos is None:
                pending_pos = pos
            pending.append(data)
            continue
        if pending:
            # A non-text event (or the sentinel) ends the run
            yield TEXT, u''.join(pending), pending_pos
            pending = []
            pending_pos = None
        if kind:
            yield kind, data, pos
---|