| 1 | # -*- coding: utf-8 -*- |
|---|
| 2 | # |
|---|
| 3 | # Copyright (C) 2006-2008 Edgewall Software |
|---|
| 4 | # All rights reserved. |
|---|
| 5 | # |
|---|
| 6 | # This software is licensed as described in the file COPYING, which |
|---|
| 7 | # you should have received as part of this distribution. The terms |
|---|
| 8 | # are also available at http://genshi.edgewall.org/wiki/License. |
|---|
| 9 | # |
|---|
| 10 | # This software consists of voluntary contributions made by many |
|---|
| 11 | # individuals. For the exact contribution history, see the revision |
|---|
| 12 | # history and logs, available at http://genshi.edgewall.org/log/. |
|---|
| 13 | |
|---|
| 14 | """Core classes for markup processing.""" |
|---|
| 15 | |
|---|
| 16 | try: |
|---|
| 17 | from functools import reduce |
|---|
| 18 | except ImportError: |
|---|
| 19 | pass # builtin in Python <= 2.5 |
|---|
| 20 | from itertools import chain |
|---|
| 21 | import operator |
|---|
| 22 | |
|---|
| 23 | from genshi.util import plaintext, stripentities, striptags |
|---|
| 24 | |
|---|
# Public names exported by a star-import of this module
__all__ = ['Stream', 'Markup', 'escape', 'unescape', 'Attrs', 'Namespace',
           'QName']
# Markup format used by the docstrings in this module
__docformat__ = 'restructuredtext en'
|---|
| 28 | |
|---|
| 29 | |
|---|
class StreamEventKind(str):
    """String subclass that names the kind of an event on a markup stream.

    Instances are registered per value in the class-level `_instances`
    mapping, so asking for the same kind twice hands back the very same
    object.
    """
    __slots__ = []
    _instances = {}

    def __new__(cls, val):
        # ``setdefault`` only stores the candidate when ``val`` has not been
        # registered yet; otherwise the previously registered instance wins
        candidate = str.__new__(cls, val)
        return cls._instances.setdefault(val, candidate)
|---|
| 37 | |
|---|
| 38 | |
|---|
class Stream(object):
    """An iterable sequence of markup events.

    Iterating over a stream yields its events one at a time. Every event is a
    tuple of the form::

      (kind, data, position)

    ``kind`` names the type of the event (for example `START`, `END` or
    `TEXT`), the shape of ``data`` depends on that kind, and ``position`` is a
    ``(filename, line, offset)`` tuple locating the originating element or
    text in the input. When the location is not known, ``position`` is
    ``(None, -1, -1)``.

    A stream can also be turned into text: `serialize()` returns an iterator
    over the generated strings, whereas `render()` produces the complete text
    in one go. Both take parameters that influence the serialization.
    """
    __slots__ = ['events', 'serializer']

    START = StreamEventKind('START') #: a start tag
    END = StreamEventKind('END') #: an end tag
    TEXT = StreamEventKind('TEXT') #: literal text
    XML_DECL = StreamEventKind('XML_DECL') #: XML declaration
    DOCTYPE = StreamEventKind('DOCTYPE') #: doctype declaration
    START_NS = StreamEventKind('START_NS') #: start namespace mapping
    END_NS = StreamEventKind('END_NS') #: end namespace mapping
    START_CDATA = StreamEventKind('START_CDATA') #: start CDATA section
    END_CDATA = StreamEventKind('END_CDATA') #: end CDATA section
    PI = StreamEventKind('PI') #: processing instruction
    COMMENT = StreamEventKind('COMMENT') #: comment

    def __init__(self, events, serializer=None):
        """Create a stream on top of the given markup events.

        :param events: a sequence or iterable providing the events
        :param serializer: the serialization method used by default when this
                           stream is serialized

        :note: Changed in 0.5: added the `serializer` argument
        """
        self.events = events #: The underlying iterable producing the events
        self.serializer = serializer #: The default serialization method

    def __iter__(self):
        return iter(self.events)

    def __or__(self, function):
        """Apply a filter or serializer to the stream using the "bitwise or"
        operator, which gives a notation resembling pipes on Unix shells.

        Assume the following stream produced by the `HTML` function:

        >>> from genshi.input import HTML
        >>> html = HTML('''<p onclick="alert('Whoa')">Hello, world!</p>''')
        >>> print html
        <p onclick="alert('Whoa')">Hello, world!</p>

        A filter such as the HTML sanitizer can be applied to that stream using
        the pipe notation as follows:

        >>> from genshi.filters import HTMLSanitizer
        >>> sanitizer = HTMLSanitizer()
        >>> print html | sanitizer
        <p>Hello, world!</p>

        Filters can be any function that accepts and produces a stream (where
        a stream is anything that iterates over events):

        >>> def uppercase(stream):
        ...     for kind, data, pos in stream:
        ...         if kind is TEXT:
        ...             data = data.upper()
        ...         yield kind, data, pos
        >>> print html | sanitizer | uppercase
        <p>HELLO, WORLD!</p>

        Serializers can also be used with this notation:

        >>> from genshi.output import TextSerializer
        >>> output = TextSerializer()
        >>> print html | sanitizer | uppercase | output
        HELLO, WORLD!

        Serializers normally belong at the very end of such a "pipeline";
        placing one in the middle may produce unexpected results.

        :param function: the callable object that should be applied as a filter
        :return: the filtered stream, carrying over this stream's default
                 serializer
        :rtype: `Stream`
        """
        # Whatever the callable returns is normalized into proper events
        filtered = _ensure(function(self))
        return Stream(filtered, serializer=self.serializer)

    def filter(self, *filters):
        """Return a new stream with the given filters applied in order.

        Each filter must be a callable that takes the stream object as its
        parameter and returns the filtered stream.

        The call::

            stream.filter(filter1, filter2)

        is equivalent to::

            stream | filter1 | filter2

        :param filters: one or more callable objects that should be applied as
                        filters
        :return: the filtered stream
        :rtype: `Stream`
        """
        stream = self
        for func in filters:
            stream = stream | func
        return stream

    def render(self, method=None, encoding='utf-8', out=None, **kwargs):
        """Serialize the stream and return the result as one string.

        Additional keyword arguments are handed to the serializer, so which
        ones are meaningful depends on the `method` parameter value.

        :param method: determines how the stream is serialized; can be either
                       "xml", "xhtml", "html", "text", or a custom serializer
                       class; if `None`, the default serialization method of
                       the stream is used
        :param encoding: how the output string should be encoded; if set to
                         `None`, this method returns a `unicode` object
        :param out: a file-like object that the output should be written to
                    instead of being returned as one big string; note that if
                    this is a file or socket (or similar), the `encoding` must
                    not be `None` (that is, the output must be encoded)
        :return: a `str` or `unicode` object (depending on the `encoding`
                 parameter), or `None` if the `out` parameter is provided
        :rtype: `basestring`

        :see: XMLSerializer, XHTMLSerializer, HTMLSerializer, TextSerializer
        :note: Changed in 0.5: added the `out` parameter
        """
        from genshi.output import encode
        if method is None:
            # Fall back to the stream's own default, and to XML after that
            method = self.serializer or 'xml'
        chunks = self.serialize(method=method, **kwargs)
        return encode(chunks, method=method, encoding=encoding, out=out)

    def select(self, path, namespaces=None, variables=None):
        """Return a new stream containing only the events that match the
        given XPath expression.

        >>> from genshi import HTML
        >>> stream = HTML('<doc><elem>foo</elem><elem>bar</elem></doc>')
        >>> print stream.select('elem')
        <elem>foo</elem><elem>bar</elem>
        >>> print stream.select('elem/text()')
        foobar

        Note that the outermost element of the stream becomes the *context
        node* for the XPath test. That means that the expression "doc" would
        not match anything in the example above, because it only tests against
        child elements of the outermost element:

        >>> print stream.select('doc')
        <BLANKLINE>

        You can use the "." expression to match the context node itself
        (although that usually makes little sense):

        >>> print stream.select('.')
        <doc><elem>foo</elem><elem>bar</elem></doc>

        :param path: a string containing the XPath expression
        :param namespaces: mapping of namespace prefixes used in the path
        :param variables: mapping of variable names to values
        :return: the selected substream
        :rtype: `Stream`
        :raises PathSyntaxError: if the given path expression is invalid or not
                                 supported
        """
        from genshi.path import Path
        xpath = Path(path)
        return xpath.select(self, namespaces, variables)

    def serialize(self, method='xml', **kwargs):
        """Return an iterator over the strings that make up a specific
        serialization of the stream.

        In contrast to `render()`, the output is produced incrementally
        rather than being collected into a single string.

        Additional keyword arguments are handed to the serializer, so which
        ones are meaningful depends on the `method` parameter value.

        :param method: determines how the stream is serialized; can be either
                       "xml", "xhtml", "html", "text", or a custom serializer
                       class; if `None`, the default serialization method of
                       the stream is used
        :return: an iterator over the serialization results (`Markup` or
                 `unicode` objects, depending on the serialization method)
        :rtype: ``iterator``
        :see: XMLSerializer, XHTMLSerializer, HTMLSerializer, TextSerializer
        """
        from genshi.output import get_serializer
        if method is None:
            # Fall back to the stream's own default, and to XML after that
            method = self.serializer or 'xml'
        serializer = get_serializer(method, **kwargs)
        return serializer(_ensure(self))

    def __str__(self):
        return self.render()

    def __unicode__(self):
        return self.render(encoding=None)

    def __html__(self):
        return self
|---|
| 254 | |
|---|
| 255 | |
|---|
# Module-level aliases for the event kinds defined on `Stream`, so they can be
# referenced without the ``Stream.`` prefix
START = Stream.START
END = Stream.END
TEXT = Stream.TEXT
XML_DECL = Stream.XML_DECL
DOCTYPE = Stream.DOCTYPE
START_NS = Stream.START_NS
END_NS = Stream.END_NS
START_CDATA = Stream.START_CDATA
END_CDATA = Stream.END_CDATA
PI = Stream.PI
COMMENT = Stream.COMMENT
|---|
| 267 | |
|---|
| 268 | def _ensure(stream): |
|---|
| 269 | """Ensure that every item on the stream is actually a markup event.""" |
|---|
| 270 | stream = iter(stream) |
|---|
| 271 | event = stream.next() |
|---|
| 272 | |
|---|
| 273 | # Check whether the iterable is a real markup event stream by examining the |
|---|
| 274 | # first item it yields; if it's not we'll need to do some conversion |
|---|
| 275 | if type(event) is not tuple or len(event) != 3: |
|---|
| 276 | for event in chain([event], stream): |
|---|
| 277 | if hasattr(event, 'totuple'): |
|---|
| 278 | event = event.totuple() |
|---|
| 279 | else: |
|---|
| 280 | event = TEXT, unicode(event), (None, -1, -1) |
|---|
| 281 | yield event |
|---|
| 282 | return |
|---|
| 283 | |
|---|
| 284 | # This looks like a markup event stream, so we'll just pass it through |
|---|
| 285 | # unchanged |
|---|
| 286 | yield event |
|---|
| 287 | for event in stream: |
|---|
| 288 | yield event |
|---|
| 289 | |
|---|
| 290 | |
|---|
class Attrs(tuple):
    """Immutable sequence type that stores the attributes of an element.

    The items are ``(name, value)`` tuples. Ordering of the attributes is
    preserved, while access by name is also supported.

    >>> attrs = Attrs([('href', '#'), ('title', 'Foo')])
    >>> attrs
    Attrs([('href', '#'), ('title', 'Foo')])

    >>> 'href' in attrs
    True
    >>> 'tabindex' in attrs
    False
    >>> attrs.get('title')
    'Foo'

    Instances may not be manipulated directly. Instead, the operators ``|`` and
    ``-`` can be used to produce new instances that have specific attributes
    added, replaced or removed.

    To remove an attribute, use the ``-`` operator. The right hand side can be
    either a string or a set/sequence of strings, identifying the name(s) of
    the attribute(s) to remove:

    >>> attrs - 'title'
    Attrs([('href', '#')])
    >>> attrs - ('title', 'href')
    Attrs()

    The original instance is not modified, but the operator can of course be
    used with an assignment:

    >>> attrs
    Attrs([('href', '#'), ('title', 'Foo')])
    >>> attrs -= 'title'
    >>> attrs
    Attrs([('href', '#')])

    To add a new attribute, use the ``|`` operator, where the right hand value
    is a sequence of ``(name, value)`` tuples (which includes `Attrs`
    instances):

    >>> attrs | [('title', 'Bar')]
    Attrs([('href', '#'), ('title', 'Bar')])

    If the attributes already contain an attribute with a given name, the value
    of that attribute is replaced:

    >>> attrs | [('href', 'http://example.org/')]
    Attrs([('href', 'http://example.org/')])
    """
    __slots__ = []

    def __contains__(self, name):
        """Return whether the list includes an attribute with the specified
        name.

        :param name: the name of the attribute to look for
        :return: `True` if the list includes the attribute, `False` otherwise
        :rtype: `bool`
        """
        for attr, _ in self:
            if attr == name:
                return True
        # Be explicit so that direct callers get a real `bool`, not `None`
        return False

    def __getitem__(self, i):
        """Return the item at the given index, or an `Attrs` instance for a
        slice.

        This covers slices that are not routed through `__getslice__` (for
        example extended slices), which would otherwise come back as plain
        tuples.

        >>> attrs = Attrs([('href', '#'), ('title', 'Foo')])
        >>> attrs[0]
        ('href', '#')
        >>> attrs[::2]
        Attrs([('href', '#')])
        """
        items = tuple.__getitem__(self, i)
        if isinstance(i, slice):
            return Attrs(items)
        return items

    def __getslice__(self, i, j):
        """Return a slice of the attributes list.

        >>> attrs = Attrs([('href', '#'), ('title', 'Foo')])
        >>> attrs[1:]
        Attrs([('title', 'Foo')])
        """
        return Attrs(tuple.__getslice__(self, i, j))

    def __or__(self, attrs):
        """Return a new instance that contains the attributes in `attrs` in
        addition to any already existing attributes.

        Attributes that already exist keep their position but take the new
        value; attributes with new names are appended in the given order.

        :param attrs: a sequence of ``(name, value)`` tuples; it is iterated
                      more than once
        :return: a new instance with the merged attributes
        :rtype: `Attrs`
        """
        # Replacement values for the names this instance already has
        repl = dict([(an, av) for an, av in attrs if an in self])
        return Attrs([(sn, repl.get(sn, sv)) for sn, sv in self] +
                     [(an, av) for an, av in attrs if an not in self])

    def __repr__(self):
        if not self:
            return 'Attrs()'
        return 'Attrs([%s])' % ', '.join([repr(item) for item in self])

    def __sub__(self, names):
        """Return a new instance from which all attributes with a name in
        `names` are removed.

        :param names: the name of the attribute to remove, or a set/sequence
                      of such names
        :return: a new instance with the attribute(s) removed
        :rtype: `Attrs`
        """
        if isinstance(names, basestring):
            # A lone string is a single name, not a sequence of characters
            names = (names,)
        return Attrs([(name, val) for name, val in self if name not in names])

    def get(self, name, default=None):
        """Return the value of the attribute with the specified name, or the
        value of the `default` parameter if no such attribute is found.

        :param name: the name of the attribute
        :param default: the value to return when the attribute does not exist
        :return: the attribute value, or the `default` value if that attribute
                 does not exist
        :rtype: `object`
        """
        for attr, value in self:
            if attr == name:
                return value
        return default

    def totuple(self):
        """Return the attributes as a markup event.

        The returned event is a `TEXT` event, the data is the value of all
        attributes joined together.

        >>> Attrs([('href', '#'), ('title', 'Foo')]).totuple()
        ('TEXT', u'#Foo', (None, -1, -1))

        :return: a `TEXT` event
        :rtype: `tuple`
        """
        return TEXT, u''.join([x[1] for x in self]), (None, -1, -1)
|---|
| 421 | |
|---|
| 422 | |
|---|
class Markup(unicode):
    """Marks a string as being safe for inclusion in HTML/XML output without
    needing to be escaped.

    Operations that combine a `Markup` object with other values (``+``, ``%``
    and `join`) escape those other values first, so that the result can again
    be treated as safe markup.
    """
    __slots__ = []

    def __add__(self, other):
        return Markup(unicode(self) + unicode(escape(other)))

    def __radd__(self, other):
        return Markup(unicode(escape(other)) + unicode(self))

    def __mod__(self, args):
        # Escape every interpolated value before it is formatted into the
        # markup; mappings and sequences are escaped item by item
        if isinstance(args, dict):
            args = dict(zip(args.keys(), map(escape, args.values())))
        elif isinstance(args, (list, tuple)):
            args = tuple(map(escape, args))
        else:
            args = escape(args)
        return Markup(unicode.__mod__(self, args))

    def __mul__(self, num):
        return Markup(unicode(self) * num)

    def __rmul__(self, num):
        return Markup(num * unicode(self))

    def __repr__(self):
        return '<%s %r>' % (self.__class__.__name__, unicode(self))

    def join(self, seq, escape_quotes=True):
        """Return a `Markup` object which is the concatenation of the strings
        in the given sequence, where this `Markup` object is the separator
        between the joined elements.

        Any element in the sequence that is not a `Markup` instance is
        automatically escaped.

        :param seq: the sequence of strings to join
        :param escape_quotes: whether double quote characters in the elements
                              should be escaped
        :return: the joined `Markup` object
        :rtype: `Markup`
        :see: `escape`
        """
        return Markup(unicode(self).join([escape(item, quotes=escape_quotes)
                                          for item in seq]))

    def escape(cls, text, quotes=True):
        """Create a Markup instance from a string and escape special characters
        it may contain (<, >, & and \").

        >>> escape('"1 < 2"')
        <Markup u'&#34;1 &lt; 2&#34;'>

        If the `quotes` parameter is set to `False`, the \" character is left
        as is. Escaping quotes is generally only required for strings that are
        to be used in attribute values.

        >>> escape('"1 < 2"', quotes=False)
        <Markup u'"1 &lt; 2"'>

        Values that are false yield an empty instance, instances of exactly
        this class are returned as they are, and objects providing an
        ``__html__()`` method are wrapped without further escaping.

        :param text: the text to escape
        :param quotes: if ``True``, double quote characters are escaped in
                       addition to the other special characters
        :return: the escaped `Markup` string
        :rtype: `Markup`
        """
        if not text:
            return cls()
        if type(text) is cls:
            return text
        if hasattr(text, '__html__'):
            return Markup(text.__html__())

        # The ampersand must be replaced first, otherwise the ampersands of
        # the entities introduced below would get escaped a second time
        text = unicode(text).replace('&', '&amp;') \
                            .replace('<', '&lt;') \
                            .replace('>', '&gt;')
        if quotes:
            text = text.replace('"', '&#34;')
        return cls(text)
    escape = classmethod(escape)

    def unescape(self):
        """Reverse-escapes &, <, >, and \" and returns a `unicode` object.

        >>> Markup('1 &lt; 2').unescape()
        u'1 < 2'

        :return: the unescaped string
        :rtype: `unicode`
        :see: `genshi.core.unescape`
        """
        if not self:
            return u''
        # The ampersand entity is resolved last, so that text such as
        # ``&amp;lt;`` correctly turns into ``&lt;`` rather than ``<``
        return unicode(self).replace('&#34;', '"') \
                            .replace('&gt;', '>') \
                            .replace('&lt;', '<') \
                            .replace('&amp;', '&')

    def stripentities(self, keepxmlentities=False):
        """Return a copy of the text with any character or numeric entities
        replaced by the equivalent UTF-8 characters.

        If the `keepxmlentities` parameter is provided and evaluates to `True`,
        the core XML entities (``&amp;``, ``&apos;``, ``&gt;``, ``&lt;`` and
        ``&quot;``) are not stripped.

        :param keepxmlentities: whether the core XML entities should be kept
        :return: a `Markup` instance with entities removed
        :rtype: `Markup`
        :see: `genshi.util.stripentities`
        """
        return Markup(stripentities(self, keepxmlentities=keepxmlentities))

    def striptags(self):
        """Return a copy of the text with all XML/HTML tags removed.

        :return: a `Markup` instance with all tags removed
        :rtype: `Markup`
        :see: `genshi.util.striptags`
        """
        return Markup(striptags(self))
|---|
| 545 | |
|---|
| 546 | |
|---|
# Prefer the `Markup` implementation from `genshi._speedups` when that module
# can be imported; otherwise the Python class defined above stays in use
try:
    from genshi._speedups import Markup
except ImportError:
    pass # just use the Python implementation

# Module-level shortcut for the `Markup.escape` class method
escape = Markup.escape
|---|
| 553 | |
|---|
def unescape(text):
    """Reverse-escapes &, <, >, and \" and returns a `unicode` object.

    >>> unescape(Markup('1 &lt; 2'))
    u'1 < 2'

    Only `Markup` instances are processed; any other object is handed back
    exactly as it was passed in.

    >>> unescape('1 &lt; 2')
    '1 &lt; 2'

    :param text: the text to unescape
    :return: the unescaped string
    :rtype: `unicode`
    """
    if isinstance(text, Markup):
        return text.unescape()
    return text
|---|
| 573 | |
|---|
| 574 | |
|---|
class Namespace(object):
    """Helper for creating and testing element names that carry a namespace.

    The namespace URI is stored inside the `QName` of an element or attribute,
    terminated by a closing curly brace. This class builds such names and
    checks whether a name belongs to the namespace.

    A `Namespace` object is instantiated with the namespace URI.

    >>> html = Namespace('http://www.w3.org/1999/xhtml')
    >>> html
    <Namespace "http://www.w3.org/1999/xhtml">
    >>> html.uri
    u'http://www.w3.org/1999/xhtml'

    The `Namespace` object can then be used to generate `QName` objects with
    that namespace:

    >>> html.body
    QName(u'http://www.w3.org/1999/xhtml}body')
    >>> html.body.localname
    u'body'
    >>> html.body.namespace
    u'http://www.w3.org/1999/xhtml'

    The same works using item access notation, which is useful for element or
    attribute names that are not valid Python identifiers:

    >>> html['body']
    QName(u'http://www.w3.org/1999/xhtml}body')

    A `Namespace` object can also be used to test whether a specific `QName`
    belongs to that namespace using the ``in`` operator:

    >>> qname = html.body
    >>> qname in html
    True
    >>> qname in Namespace('http://www.w3.org/2002/06/xhtml2')
    False
    """
    def __new__(cls, uri):
        # An object that already is an instance of exactly this class is
        # reused rather than wrapped in a new one
        if type(uri) is not cls:
            return object.__new__(cls)
        return uri

    def __init__(self, uri):
        self.uri = unicode(uri)

    # Pickling support: the URI is both the constructor argument and the
    # complete instance state

    def __getnewargs__(self):
        return (self.uri,)

    def __getstate__(self):
        return self.uri

    def __setstate__(self, uri):
        self.uri = uri

    def __contains__(self, qname):
        return qname.namespace == self.uri

    def __eq__(self, other):
        # Another namespace is compared by its URI; anything else is compared
        # against the URI directly
        if isinstance(other, Namespace):
            other = other.uri
        return self.uri == other

    def __ne__(self, other):
        return not (self == other)

    def __hash__(self):
        return hash(self.uri)

    def __getitem__(self, name):
        qualified = self.uri + u'}' + name
        return QName(qualified)
    # Attribute access that finds nothing else yields the qualified name too
    __getattr__ = __getitem__

    def __repr__(self):
        return '<Namespace "%s">' % self.uri

    def __str__(self):
        return self.uri.encode('utf-8')

    def __unicode__(self):
        return self.uri
|---|
| 658 | |
|---|
| 659 | |
|---|
# The namespace used by attributes such as xml:lang and xml:space, made
# available as a `Namespace` instance
XML_NAMESPACE = Namespace('http://www.w3.org/XML/1998/namespace')
|---|
| 662 | |
|---|
| 663 | |
|---|
class QName(unicode):
    """A qualified element or attribute name.

    The unicode value of instances of this class contains the qualified name of
    the element or attribute, in the form ``{namespace-uri}local-name``. The
    namespace URI can be obtained through the additional `namespace` attribute,
    while the local name can be accessed through the `localname` attribute.

    >>> qname = QName('foo')
    >>> qname
    QName(u'foo')
    >>> qname.localname
    u'foo'
    >>> qname.namespace

    >>> qname = QName('http://www.w3.org/1999/xhtml}body')
    >>> qname
    QName(u'http://www.w3.org/1999/xhtml}body')
    >>> qname.localname
    u'body'
    >>> qname.namespace
    u'http://www.w3.org/1999/xhtml'

    The leading curly brace may be included as well, which results in the very
    same name:

    >>> QName('{http://www.w3.org/1999/xhtml}body') == qname
    True
    """
    __slots__ = ['namespace', 'localname']

    def __new__(cls, qname):
        """Create the `QName` instance.

        An object that already is an instance of exactly this class is
        returned as is.

        :param qname: the qualified name as a string of the form
                      ``{namespace-uri}local-name``, where the leading curly
                      brace is optional
        """
        if type(qname) is cls:
            return qname

        stripped = qname.lstrip(u'{')
        parts = stripped.split(u'}', 1)
        if len(parts) > 1:
            # Prepend the brace to the *stripped* name: the argument may
            # already start with one, which must not end up doubled in the
            # value
            self = unicode.__new__(cls, u'{%s' % stripped)
            self.namespace = unicode(parts[0])
            self.localname = unicode(parts[1])
        else:
            # No namespace part: the name is used exactly as given
            self = unicode.__new__(cls, qname)
            self.namespace, self.localname = None, unicode(qname)
        return self

    def __getnewargs__(self):
        # Pickling support: the constructor accepts the name without the
        # leading brace
        return (self.lstrip('{'),)

    def __repr__(self):
        return 'QName(%s)' % unicode.__repr__(self.lstrip('{'))
|---|