Edgewall Software

source: branches/stable/0.6.x/genshi/core.py

Last change on this file was 1151, checked in by hodgestar, 13 years ago

Merge r1148 from trunk (fix for qname equality issue #413).

  • Property svn:eol-style set to native
File size: 24.4 KB
Line 
1# -*- coding: utf-8 -*-
2#
3# Copyright (C) 2006-2009 Edgewall Software
4# All rights reserved.
5#
6# This software is licensed as described in the file COPYING, which
7# you should have received as part of this distribution. The terms
8# are also available at http://genshi.edgewall.org/wiki/License.
9#
10# This software consists of voluntary contributions made by many
11# individuals. For the exact contribution history, see the revision
12# history and logs, available at http://genshi.edgewall.org/log/.
13
14"""Core classes for markup processing."""
15
16try:
17    reduce # builtin in Python < 3
18except NameError:
19    from functools import reduce
20from itertools import chain
21import operator
22
23from genshi.util import plaintext, stripentities, striptags, stringrepr
24
25__all__ = ['Stream', 'Markup', 'escape', 'unescape', 'Attrs', 'Namespace',
26           'QName']
27__docformat__ = 'restructuredtext en'
28
29
class StreamEventKind(str):
    """String subclass used to identify the kind of a markup stream event.

    Instances are interned: constructing the class twice with the same value
    returns the very same object, so event kinds can be compared with ``is``.
    """
    __slots__ = []
    _instances = {}  # maps each value seen so far to its unique instance

    def __new__(cls, val):
        """Return the shared instance for `val`, creating it on first use.

        :param val: the name of the event kind
        :return: the interned `StreamEventKind` for that name
        """
        registry = cls._instances
        if val not in registry:
            registry[val] = str.__new__(cls, val)
        return registry[val]
37
38
class Stream(object):
    """An iterable sequence of markup events.

    Iterating over a stream produces the events it wraps. Every event is a
    tuple of the form::

      (kind, data, position)

    ``kind`` is the event kind (for example `START`, `END` or `TEXT`), the
    meaning of ``data`` depends on that kind, and ``position`` is a
    ``(filename, line, offset)`` tuple pointing at the original element or
    text in the input. When the original location is not known, ``position``
    is ``(None, -1, -1)``.

    A stream can also be turned into text: `serialize()` returns an iterator
    over the generated strings, whereas `render()` returns the whole output in
    one piece. Both take parameters that influence how the stream is
    serialized.
    """
    __slots__ = ['events', 'serializer']

    START = StreamEventKind('START') #: a start tag
    END = StreamEventKind('END') #: an end tag
    TEXT = StreamEventKind('TEXT') #: literal text
    XML_DECL = StreamEventKind('XML_DECL') #: XML declaration
    DOCTYPE = StreamEventKind('DOCTYPE') #: doctype declaration
    START_NS = StreamEventKind('START_NS') #: start namespace mapping
    END_NS = StreamEventKind('END_NS') #: end namespace mapping
    START_CDATA = StreamEventKind('START_CDATA') #: start CDATA section
    END_CDATA = StreamEventKind('END_CDATA') #: end CDATA section
    PI = StreamEventKind('PI') #: processing instruction
    COMMENT = StreamEventKind('COMMENT') #: comment

    def __init__(self, events, serializer=None):
        """Create a stream wrapping the given markup events.

        :param events: a sequence or iterable providing the events
        :param serializer: the default serialization method to use for this
                           stream

        :note: Changed in 0.5: added the `serializer` argument
        """
        self.events = events #: The underlying iterable producing the events
        self.serializer = serializer #: The default serialization method

    def __iter__(self):
        """Return an iterator over the underlying events."""
        return iter(self.events)

    def __or__(self, function):
        """Apply a filter or serializer to the stream using the ``|``
        operator, in the spirit of pipes on Unix shells.

        Assume the following stream produced by the `HTML` function:

        >>> from genshi.input import HTML
        >>> html = HTML('''<p onclick="alert('Whoa')">Hello, world!</p>''')
        >>> print(html)
        <p onclick="alert('Whoa')">Hello, world!</p>

        A filter such as the HTML sanitizer can be applied to that stream using
        the pipe notation as follows:

        >>> from genshi.filters import HTMLSanitizer
        >>> sanitizer = HTMLSanitizer()
        >>> print(html | sanitizer)
        <p>Hello, world!</p>

        Filters can be any function that accepts and produces a stream (where
        a stream is anything that iterates over events):

        >>> def uppercase(stream):
        ...     for kind, data, pos in stream:
        ...         if kind is TEXT:
        ...             data = data.upper()
        ...         yield kind, data, pos
        >>> print(html | sanitizer | uppercase)
        <p>HELLO, WORLD!</p>

        Serializers can also be used with this notation:

        >>> from genshi.output import TextSerializer
        >>> output = TextSerializer()
        >>> print(html | sanitizer | uppercase | output)
        HELLO, WORLD!

        Commonly, serializers should be used at the end of the "pipeline";
        using them somewhere in the middle may produce unexpected results.

        :param function: the callable object that should be applied as a filter
        :return: the filtered stream
        :rtype: `Stream`
        """
        # Whatever the callable produced is normalized into proper events, and
        # the default serializer of this stream carries over to the result
        filtered = _ensure(function(self))
        return Stream(filtered, serializer=self.serializer)

    def filter(self, *filters):
        """Return a new stream with the given filters applied in order.

        Each filter must be a callable that takes the stream object as its
        parameter and returns the filtered stream.

        The call::

            stream.filter(filter1, filter2)

        is equivalent to::

            stream | filter1 | filter2

        :param filters: one or more callable objects that should be applied as
                        filters
        :return: the filtered stream
        :rtype: `Stream`
        """
        result = self
        for function in filters:
            result = result | function
        return result

    def render(self, method=None, encoding='utf-8', out=None, **kwargs):
        """Serialize the stream and return the result as a single string.

        Any additional keyword arguments are passed to the serializer, and thus
        depend on the `method` parameter value.

        :param method: determines how the stream is serialized; can be either
                       "xml", "xhtml", "html", "text", or a custom serializer
                       class; if `None`, the default serialization method of
                       the stream is used
        :param encoding: how the output string should be encoded; if set to
                         `None`, this method returns a `unicode` object
        :param out: a file-like object that the output should be written to
                    instead of being returned as one big string; note that if
                    this is a file or socket (or similar), the `encoding` must
                    not be `None` (that is, the output must be encoded)
        :return: a `str` or `unicode` object (depending on the `encoding`
                 parameter), or `None` if the `out` parameter is provided
        :rtype: `basestring`

        :see: XMLSerializer, XHTMLSerializer, HTMLSerializer, TextSerializer
        :note: Changed in 0.5: added the `out` parameter
        """
        from genshi.output import encode
        chosen = method
        if chosen is None:
            # Fall back to the stream's own default, and to XML after that
            chosen = self.serializer or 'xml'
        chunks = self.serialize(method=chosen, **kwargs)
        return encode(chunks, method=chosen, encoding=encoding, out=out)

    def select(self, path, namespaces=None, variables=None):
        """Return a new stream holding only the events that match the given
        XPath expression.

        >>> from genshi import HTML
        >>> stream = HTML('<doc><elem>foo</elem><elem>bar</elem></doc>')
        >>> print(stream.select('elem'))
        <elem>foo</elem><elem>bar</elem>
        >>> print(stream.select('elem/text()'))
        foobar

        Note that the outermost element of the stream becomes the *context
        node* for the XPath test. That means that the expression "doc" would
        not match anything in the example above, because it only tests against
        child elements of the outermost element:

        >>> print(stream.select('doc'))
        <BLANKLINE>

        You can use the "." expression to match the context node itself
        (although that usually makes little sense):

        >>> print(stream.select('.'))
        <doc><elem>foo</elem><elem>bar</elem></doc>

        :param path: a string containing the XPath expression
        :param namespaces: mapping of namespace prefixes used in the path
        :param variables: mapping of variable names to values
        :return: the selected substream
        :rtype: `Stream`
        :raises PathSyntaxError: if the given path expression is invalid or not
                                 supported
        """
        from genshi.path import Path
        compiled = Path(path)
        return compiled.select(self, namespaces, variables)

    def serialize(self, method='xml', **kwargs):
        """Serialize the stream incrementally.

        In contrast to `render()`, which hands back one complete string, this
        method returns the serialized output piece by piece.

        Any additional keyword arguments are passed to the serializer, and thus
        depend on the `method` parameter value.

        :param method: determines how the stream is serialized; can be either
                       "xml", "xhtml", "html", "text", or a custom serializer
                       class; if `None`, the default serialization method of
                       the stream is used
        :return: an iterator over the serialization results (`Markup` or
                 `unicode` objects, depending on the serialization method)
        :rtype: ``iterator``
        :see: XMLSerializer, XHTMLSerializer, HTMLSerializer, TextSerializer
        """
        from genshi.output import get_serializer
        chosen = method
        if chosen is None:
            chosen = self.serializer or 'xml'
        serializer = get_serializer(chosen, **kwargs)
        return serializer(_ensure(self))

    def __str__(self):
        """Render the stream using the defaults of `render()`."""
        return self.render()

    def __unicode__(self):
        """Render the stream without encoding the output."""
        return self.render(encoding=None)

    def __html__(self):
        """Return the stream itself as its markup representation."""
        return self
254
255
# Module-level aliases for the event kind constants defined on `Stream`
START, END, TEXT = Stream.START, Stream.END, Stream.TEXT
XML_DECL, DOCTYPE = Stream.XML_DECL, Stream.DOCTYPE
START_NS, END_NS = Stream.START_NS, Stream.END_NS
START_CDATA, END_CDATA = Stream.START_CDATA, Stream.END_CDATA
PI, COMMENT = Stream.PI, Stream.COMMENT
267
268
269def _ensure(stream):
270    """Ensure that every item on the stream is actually a markup event."""
271    stream = iter(stream)
272    event = stream.next()
273
274    # Check whether the iterable is a real markup event stream by examining the
275    # first item it yields; if it's not we'll need to do some conversion
276    if type(event) is not tuple or len(event) != 3:
277        for event in chain([event], stream):
278            if hasattr(event, 'totuple'):
279                event = event.totuple()
280            else:
281                event = TEXT, unicode(event), (None, -1, -1)
282            yield event
283        return
284
285    # This looks like a markup event stream, so we'll just pass it through
286    # unchanged
287    yield event
288    for event in stream:
289        yield event
290
291
class Attrs(tuple):
    """Immutable sequence type that stores the attributes of an element.

    Ordering of the attributes is preserved, while access by name is also
    supported.

    >>> attrs = Attrs([('href', '#'), ('title', 'Foo')])
    >>> attrs
    Attrs([('href', '#'), ('title', 'Foo')])

    >>> 'href' in attrs
    True
    >>> 'tabindex' in attrs
    False
    >>> attrs.get('title')
    'Foo'

    Instances may not be manipulated directly. Instead, the operators ``|`` and
    ``-`` can be used to produce new instances that have specific attributes
    added, replaced or removed.

    To remove an attribute, use the ``-`` operator. The right hand side can be
    either a string or a set/sequence of strings, identifying the name(s) of
    the attribute(s) to remove:

    >>> attrs - 'title'
    Attrs([('href', '#')])
    >>> attrs - ('title', 'href')
    Attrs()

    The original instance is not modified, but the operator can of course be
    used with an assignment:

    >>> attrs
    Attrs([('href', '#'), ('title', 'Foo')])
    >>> attrs -= 'title'
    >>> attrs
    Attrs([('href', '#')])

    To add a new attribute, use the ``|`` operator, where the right hand value
    is a sequence of ``(name, value)`` tuples (which includes `Attrs`
    instances):

    >>> attrs | [('title', 'Bar')]
    Attrs([('href', '#'), ('title', 'Bar')])

    If the attributes already contain an attribute with a given name, the value
    of that attribute is replaced:

    >>> attrs | [('href', 'http://example.org/')]
    Attrs([('href', 'http://example.org/')])
    """
    __slots__ = []

    def __contains__(self, name):
        """Return whether the list includes an attribute with the specified
        name.

        :param name: the name of the attribute to look for
        :return: `True` if the list includes the attribute, `False` otherwise
        :rtype: `bool`
        """
        for attr, _ in self:
            if attr == name:
                return True
        # Explicit result, so that direct callers get a real `bool` as
        # documented rather than `None`
        return False

    def __getitem__(self, i):
        """Return an item or slice of the attributes list.

        >>> attrs = Attrs([('href', '#'), ('title', 'Foo')])
        >>> attrs[1]
        ('title', 'Foo')
        >>> attrs[1:]
        Attrs([('title', 'Foo')])
        """
        items = tuple.__getitem__(self, i)
        if isinstance(i, slice):
            # Slices stay `Attrs` instances instead of decaying to tuples
            return Attrs(items)
        return items

    def __getslice__(self, i, j):
        """Return a slice of the attributes list.

        >>> attrs = Attrs([('href', '#'), ('title', 'Foo')])
        >>> attrs[1:]
        Attrs([('title', 'Foo')])
        """
        return Attrs(tuple.__getslice__(self, i, j))

    def __or__(self, attrs):
        """Return a new instance that contains the attributes in `attrs` in
        addition to any already existing attributes.

        Attributes whose name is already present keep their position but take
        the new value; all other attributes are appended in the order given.
        `attrs` is traversed exactly once, so one-shot iterables such as
        generators are handled correctly.

        :param attrs: an iterable of ``(name, value)`` tuples
        :return: a new instance with the merged attributes
        :rtype: `Attrs`
        """
        replaced = {}
        added = []
        for name, value in attrs:
            if name in self:
                replaced[name] = value
            else:
                added.append((name, value))
        return Attrs([(sn, replaced.get(sn, sv)) for sn, sv in self] + added)

    def __repr__(self):
        if not self:
            return 'Attrs()'
        return 'Attrs([%s])' % ', '.join([repr(item) for item in self])

    def __sub__(self, names):
        """Return a new instance from which all attributes with a name in
        `names` are removed.

        :param names: the name of the attribute to remove, or a set/sequence
                      of such names
        :return: a new instance with the attribute(s) removed
        :rtype: `Attrs`
        """
        if isinstance(names, basestring):
            # A single name: wrap it so the membership test below compares
            # whole names rather than substrings
            names = (names,)
        return Attrs([(name, val) for name, val in self if name not in names])

    def get(self, name, default=None):
        """Return the value of the attribute with the specified name, or the
        value of the `default` parameter if no such attribute is found.

        :param name: the name of the attribute
        :param default: the value to return when the attribute does not exist
        :return: the attribute value, or the `default` value if that attribute
                 does not exist
        :rtype: `object`
        """
        for attr, value in self:
            if attr == name:
                return value
        return default

    def totuple(self):
        """Return the attributes as a markup event.

        The returned event is a `TEXT` event, the data is the value of all
        attributes joined together.

        >>> Attrs([('href', '#'), ('title', 'Foo')]).totuple()
        ('TEXT', '#Foo', (None, -1, -1))

        :return: a `TEXT` event
        :rtype: `tuple`
        """
        return TEXT, ''.join([x[1] for x in self]), (None, -1, -1)
436
437
class Markup(unicode):
    """A string flagged as safe for direct inclusion in HTML/XML output, that
    is, one that does not need to be escaped.
    """
    __slots__ = []

    def __add__(self, other):
        """Return the concatenation with the escaped form of `other`."""
        escaped = escape(other)
        return Markup(unicode.__add__(self, escaped))

    def __radd__(self, other):
        """Return the escaped form of `other` followed by this markup."""
        escaped = escape(other)
        return Markup(unicode.__add__(escaped, self))

    def __mod__(self, args):
        """Apply string formatting, escaping the arguments first."""
        if isinstance(args, dict):
            names = args.keys()
            values = [escape(value) for value in args.values()]
            args = dict(zip(names, values))
        elif isinstance(args, (list, tuple)):
            args = tuple([escape(item) for item in args])
        else:
            args = escape(args)
        return Markup(unicode.__mod__(self, args))

    def __mul__(self, num):
        """Return this markup repeated `num` times, still as `Markup`."""
        repeated = unicode.__mul__(self, num)
        return Markup(repeated)
    __rmul__ = __mul__

    def __repr__(self):
        return "<%s %s>" % (type(self).__name__, unicode.__repr__(self))

    def join(self, seq, escape_quotes=True):
        """Join the strings in the given sequence into one `Markup` object,
        using this `Markup` object as the separator between the elements.

        Any element in the sequence that is not a `Markup` instance is
        automatically escaped.

        :param seq: the sequence of strings to join
        :param escape_quotes: whether double quote characters in the elements
                              should be escaped
        :return: the joined `Markup` object
        :rtype: `Markup`
        :see: `escape`
        """
        escaped = [escape(item, quotes=escape_quotes) for item in seq]
        return Markup(unicode.join(self, escaped))

    @classmethod
    def escape(cls, text, quotes=True):
        """Build a Markup instance from a string, escaping the special
        characters it may contain (<, >, & and ").

        >>> escape('"1 < 2"')
        <Markup u'&#34;1 &lt; 2&#34;'>

        If the `quotes` parameter is set to `False`, the " character is left
        as is. Escaping quotes is generally only required for strings that are
        to be used in attribute values.

        >>> escape('"1 < 2"', quotes=False)
        <Markup u'"1 &lt; 2"'>

        :param text: the text to escape
        :param quotes: if ``True``, double quote characters are escaped in
                       addition to the other special characters
        :return: the escaped `Markup` string
        :rtype: `Markup`
        """
        if not text:
            return cls()
        if type(text) is cls:
            # Already markup of exactly this type: nothing to escape
            return text
        if hasattr(text, '__html__'):
            return Markup(text.__html__())

        # The ampersand has to go first, otherwise the ampersands introduced
        # by the other replacements would get escaped a second time
        for char, entity in (('&', '&amp;'), ('<', '&lt;'), ('>', '&gt;')):
            text = text.replace(char, entity)
        if quotes:
            text = text.replace('"', '&#34;')
        return cls(text)

    def unescape(self):
        """Reverse the escaping of &, <, >, and " and return a `unicode`
        object.

        >>> Markup('1 &lt; 2').unescape()
        u'1 < 2'

        :return: the unescaped string
        :rtype: `unicode`
        :see: `genshi.core.unescape`
        """
        if not self:
            return ''
        text = unicode(self)
        # The ampersand entity is handled last, mirroring the order in which
        # `escape` applies the replacements
        for entity, char in (('&#34;', '"'), ('&gt;', '>'), ('&lt;', '<'),
                             ('&amp;', '&')):
            text = text.replace(entity, char)
        return text

    def stripentities(self, keepxmlentities=False):
        """Return a copy of the text with any character or numeric entities
        replaced by the equivalent UTF-8 characters.

        If the `keepxmlentities` parameter is provided and evaluates to `True`,
        the core XML entities (``&amp;``, ``&apos;``, ``&gt;``, ``&lt;`` and
        ``&quot;``) are not stripped.

        :param keepxmlentities: whether the core XML entities should be kept
        :return: a `Markup` instance with entities removed
        :rtype: `Markup`
        :see: `genshi.util.stripentities`
        """
        stripped = stripentities(self, keepxmlentities=keepxmlentities)
        return Markup(stripped)

    def striptags(self):
        """Return a copy of the text with all XML/HTML tags removed.

        :return: a `Markup` instance with all tags removed
        :rtype: `Markup`
        :see: `genshi.util.striptags`
        """
        stripped = striptags(self)
        return Markup(stripped)
558
559
# Use the `Markup` class from the optional `genshi._speedups` module when it
# can be imported; otherwise the Python implementation above stays in place
try:
    from genshi._speedups import Markup
except ImportError:
    pass


# Module-level shortcut for escaping, bound to whichever `Markup` is in use
escape = Markup.escape
567
568
def unescape(text):
    """Reverse the escaping of &, <, >, and " in a `Markup` object and return
    the result as a `unicode` object.

    >>> unescape(Markup('1 &lt; 2'))
    u'1 < 2'

    If the provided `text` object is not a `Markup` instance, it is returned
    unchanged.

    >>> unescape('1 &lt; 2')
    '1 &lt; 2'

    :param text: the text to unescape
    :return: the unescaped string
    :rtype: `unicode`
    """
    if isinstance(text, Markup):
        return text.unescape()
    # Anything that is not markup was never escaped, so hand it back as is
    return text
588
589
class Namespace(object):
    """Helper for creating and testing names that belong to a namespace.

    Internally, namespace URIs are encoded in the `QName` of any element or
    attribute, the namespace URI being enclosed in curly braces. This class
    helps create and test these strings.

    A `Namespace` object is instantiated with the namespace URI.

    >>> html = Namespace('http://www.w3.org/1999/xhtml')
    >>> html
    Namespace('http://www.w3.org/1999/xhtml')
    >>> html.uri
    u'http://www.w3.org/1999/xhtml'

    The `Namespace` object can then be used to generate `QName` objects with
    that namespace:

    >>> html.body
    QName('http://www.w3.org/1999/xhtml}body')
    >>> html.body.localname
    u'body'
    >>> html.body.namespace
    u'http://www.w3.org/1999/xhtml'

    The same works using item access notation, which is useful for element or
    attribute names that are not valid Python identifiers:

    >>> html['body']
    QName('http://www.w3.org/1999/xhtml}body')

    A `Namespace` object can also be used to test whether a specific `QName`
    belongs to that namespace using the ``in`` operator:

    >>> qname = html.body
    >>> qname in html
    True
    >>> qname in Namespace('http://www.w3.org/2002/06/xhtml2')
    False
    """
    def __new__(cls, uri):
        # An object that already is exactly of this class is handed back
        # instead of being wrapped in a new instance
        if type(uri) is not cls:
            return object.__new__(cls)
        return uri

    def __getnewargs__(self):
        return (self.uri,)

    def __getstate__(self):
        return self.uri

    def __setstate__(self, uri):
        self.uri = uri

    def __init__(self, uri):
        self.uri = unicode(uri)

    def __contains__(self, qname):
        """Return whether the namespace of `qname` equals this URI."""
        return qname.namespace == self.uri

    def __ne__(self, other):
        return not (self == other)

    def __eq__(self, other):
        """Compare by URI, against another `Namespace` or a plain value."""
        if isinstance(other, Namespace):
            other = other.uri
        return self.uri == other

    def __getitem__(self, name):
        """Return the `QName` of `name` within this namespace."""
        qualified = self.uri + '}' + name
        return QName(qualified)
    # Attribute access builds qualified names in the same way as item access
    __getattr__ = __getitem__

    def __hash__(self):
        return hash(self.uri)

    def __repr__(self):
        return '%s(%s)' % (type(self).__name__, stringrepr(self.uri))

    def __str__(self):
        return self.uri.encode('utf-8')

    def __unicode__(self):
        return self.uri
673
674
# Namespace that attributes such as xml:lang and xml:space belong to
XML_NAMESPACE = Namespace('http://www.w3.org/XML/1998/namespace')
677
678
class QName(unicode):
    """The qualified name of an element or attribute.

    The unicode value of an instance holds the qualified name in the form
    ``{namespace-uri}local-name``. The two parts are additionally exposed as
    attributes: `namespace` gives the namespace URI, and `localname` gives the
    local name.

    >>> qname = QName('foo')
    >>> qname
    QName('foo')
    >>> qname.localname
    u'foo'
    >>> qname.namespace

    >>> qname = QName('http://www.w3.org/1999/xhtml}body')
    >>> qname
    QName('http://www.w3.org/1999/xhtml}body')
    >>> qname.localname
    u'body'
    >>> qname.namespace
    u'http://www.w3.org/1999/xhtml'
    """
    __slots__ = ['namespace', 'localname']

    def __new__(cls, qname):
        """Create the `QName` instance.

        :param qname: the qualified name as a string of the form
                      ``{namespace-uri}local-name``, where the leading curly
                      brace is optional
        """
        if type(qname) is cls:
            return qname

        # Drop any leading brace so that both accepted spellings are handled
        # the same way from here on
        qname = qname.lstrip('{')
        if '}' in qname:
            # Namespaced: the stored value always carries the leading brace
            namespace, localname = qname.split('}', 1)
            self = unicode.__new__(cls, '{%s' % qname)
            self.namespace = unicode(namespace)
            self.localname = unicode(localname)
        else:
            self = unicode.__new__(cls, qname)
            self.namespace = None
            self.localname = unicode(qname)
        return self

    def __getnewargs__(self):
        return (self.lstrip('{'),)

    def __repr__(self):
        return '%s(%s)' % (type(self).__name__, stringrepr(self.lstrip('{')))
Note: See TracBrowser for help on using the repository browser.