Edgewall Software

source: branches/stable/0.5.x/genshi/core.py

Last change on this file was 998, checked in by cmlenz, 15 years ago

Ported [914], [970], and [971] to 0.5.x branch.

  • Property svn:eol-style set to native
File size: 24.0 KB
Line 
1# -*- coding: utf-8 -*-
2#
3# Copyright (C) 2006-2008 Edgewall Software
4# All rights reserved.
5#
6# This software is licensed as described in the file COPYING, which
7# you should have received as part of this distribution. The terms
8# are also available at http://genshi.edgewall.org/wiki/License.
9#
10# This software consists of voluntary contributions made by many
11# individuals. For the exact contribution history, see the revision
12# history and logs, available at http://genshi.edgewall.org/log/.
13
14"""Core classes for markup processing."""
15
16try:
17    from functools import reduce
18except ImportError:
19    pass # builtin in Python <= 2.5
20from itertools import chain
21import operator
22
23from genshi.util import plaintext, stripentities, striptags
24
25__all__ = ['Stream', 'Markup', 'escape', 'unescape', 'Attrs', 'Namespace',
26           'QName']
27__docformat__ = 'restructuredtext en'
28
29
class StreamEventKind(str):
    """String subclass naming a kind of markup stream event.

    Instances are cached by value, so constructing the same kind twice gives
    back the very same object; this allows kinds to be compared by identity.
    """
    __slots__ = []
    _instances = {}

    def __new__(cls, val):
        cache = cls._instances
        try:
            return cache[val]
        except KeyError:
            # First request for this value: create the instance and remember it
            kind = cache[val] = str.__new__(cls, val)
            return kind
37
38
class Stream(object):
    """An iterable sequence of markup events.

    Iterating over a stream yields event tuples of the form::

      (kind, data, position)

    ``kind`` identifies the type of the event (`START`, `END`, `TEXT`, and so
    on), the layout of ``data`` depends on that kind, and ``position`` is a
    ``(filename, line, offset)`` tuple pointing at the place in the input
    where the element or text originated. When that location is not known,
    ``position`` is ``(None, -1, -1)``.

    A stream can also be turned into text: `serialize()` gives an iterator
    over the generated strings, whereas `render()` produces the complete
    output in one go. Both take parameters that control the serialization.
    """
    __slots__ = ['events', 'serializer']

    START = StreamEventKind('START') #: a start tag
    END = StreamEventKind('END') #: an end tag
    TEXT = StreamEventKind('TEXT') #: literal text
    XML_DECL = StreamEventKind('XML_DECL') #: XML declaration
    DOCTYPE = StreamEventKind('DOCTYPE') #: doctype declaration
    START_NS = StreamEventKind('START_NS') #: start namespace mapping
    END_NS = StreamEventKind('END_NS') #: end namespace mapping
    START_CDATA = StreamEventKind('START_CDATA') #: start CDATA section
    END_CDATA = StreamEventKind('END_CDATA') #: end CDATA section
    PI = StreamEventKind('PI') #: processing instruction
    COMMENT = StreamEventKind('COMMENT') #: comment

    def __init__(self, events, serializer=None):
        """Create a stream wrapping the given markup events.

        :param events: a sequence or iterable providing the events
        :param serializer: the serialization method used by default when
                           this stream is rendered

        :note: Changed in 0.5: added the `serializer` argument
        """
        self.events = events #: The underlying iterable producing the events
        self.serializer = serializer #: The default serialization method

    def __iter__(self):
        return iter(self.events)

    def __or__(self, function):
        """Apply a filter or serializer to the stream using the ``|``
        operator, mimicking the pipe syntax of Unix shells.

        Assume the following stream produced by the `HTML` function:

        >>> from genshi.input import HTML
        >>> html = HTML('''<p onclick="alert('Whoa')">Hello, world!</p>''')
        >>> print html
        <p onclick="alert('Whoa')">Hello, world!</p>

        A filter such as the HTML sanitizer can be applied to that stream using
        the pipe notation as follows:

        >>> from genshi.filters import HTMLSanitizer
        >>> sanitizer = HTMLSanitizer()
        >>> print html | sanitizer
        <p>Hello, world!</p>

        Filters can be any function that accepts and produces a stream (where
        a stream is anything that iterates over events):

        >>> def uppercase(stream):
        ...     for kind, data, pos in stream:
        ...         if kind is TEXT:
        ...             data = data.upper()
        ...         yield kind, data, pos
        >>> print html | sanitizer | uppercase
        <p>HELLO, WORLD!</p>

        Serializers can also be used with this notation:

        >>> from genshi.output import TextSerializer
        >>> output = TextSerializer()
        >>> print html | sanitizer | uppercase | output
        HELLO, WORLD!

        Commonly, serializers should be used at the end of the "pipeline";
        using them somewhere in the middle may produce unexpected results.

        :param function: the callable object that should be applied as a filter
        :return: the filtered stream, carrying over this stream's default
                 serializer
        :rtype: `Stream`
        """
        filtered = _ensure(function(self))
        return Stream(filtered, serializer=self.serializer)

    def filter(self, *filters):
        """Return a new stream with the given filters applied in order.

        Every filter must be a callable that takes the stream as its only
        argument and returns the filtered stream.

        The call::

            stream.filter(filter1, filter2)

        is equivalent to::

            stream | filter1 | filter2

        :param filters: one or more callable objects that should be applied as
                        filters
        :return: the filtered stream
        :rtype: `Stream`
        """
        stream = self
        for function in filters:
            stream = stream | function
        return stream

    def render(self, method=None, encoding='utf-8', out=None, **kwargs):
        """Serialize the stream and return the result as a single string.

        Keyword arguments not listed below are handed on to the serializer,
        so which ones are accepted depends on the `method` parameter.

        :param method: determines how the stream is serialized; can be either
                       "xml", "xhtml", "html", "text", or a custom serializer
                       class; if `None`, the default serialization method of
                       the stream is used
        :param encoding: how the output string should be encoded; if set to
                         `None`, this method returns a `unicode` object
        :param out: a file-like object that the output should be written to
                    instead of being returned as one big string; note that if
                    this is a file or socket (or similar), the `encoding` must
                    not be `None` (that is, the output must be encoded)
        :return: a `str` or `unicode` object (depending on the `encoding`
                 parameter), or `None` if the `out` parameter is provided
        :rtype: `basestring`

        :see: XMLSerializer, XHTMLSerializer, HTMLSerializer, TextSerializer
        :note: Changed in 0.5: added the `out` parameter
        """
        from genshi.output import encode
        if method is None:
            # Fall back to the stream's own default, and to XML after that
            method = self.serializer or 'xml'
        chunks = self.serialize(method=method, **kwargs)
        return encode(chunks, method=method, encoding=encoding, out=out)

    def select(self, path, namespaces=None, variables=None):
        """Return a new stream consisting of the events that match the given
        XPath expression.

        >>> from genshi import HTML
        >>> stream = HTML('<doc><elem>foo</elem><elem>bar</elem></doc>')
        >>> print stream.select('elem')
        <elem>foo</elem><elem>bar</elem>
        >>> print stream.select('elem/text()')
        foobar

        Note that the outermost element of the stream becomes the *context
        node* for the XPath test. That means that the expression "doc" would
        not match anything in the example above, because it only tests against
        child elements of the outermost element:

        >>> print stream.select('doc')
        <BLANKLINE>

        You can use the "." expression to match the context node itself
        (although that usually makes little sense):

        >>> print stream.select('.')
        <doc><elem>foo</elem><elem>bar</elem></doc>

        :param path: a string containing the XPath expression
        :param namespaces: mapping of namespace prefixes used in the path
        :param variables: mapping of variable names to values
        :return: the selected substream
        :rtype: `Stream`
        :raises PathSyntaxError: if the given path expression is invalid or not
                                 supported
        """
        from genshi.path import Path
        selector = Path(path)
        return selector.select(self, namespaces, variables)

    def serialize(self, method='xml', **kwargs):
        """Serialize the stream incrementally.

        In contrast to `render()`, which hands back one complete string, this
        method returns an iterator that produces the output piece by piece.

        Keyword arguments not listed below are handed on to the serializer,
        so which ones are accepted depends on the `method` parameter.

        :param method: determines how the stream is serialized; can be either
                       "xml", "xhtml", "html", "text", or a custom serializer
                       class; if `None`, the default serialization method of
                       the stream is used
        :return: an iterator over the serialization results (`Markup` or
                 `unicode` objects, depending on the serialization method)
        :rtype: ``iterator``
        :see: XMLSerializer, XHTMLSerializer, HTMLSerializer, TextSerializer
        """
        from genshi.output import get_serializer
        if method is None:
            # Fall back to the stream's own default, and to XML after that
            method = self.serializer or 'xml'
        serializer = get_serializer(method, **kwargs)
        return serializer(_ensure(self))

    def __str__(self):
        return self.render()

    def __unicode__(self):
        return self.render(encoding=None)

    def __html__(self):
        return self
254
255
# Module-level aliases for the event kinds defined on `Stream`, so that code
# in this module (and importers of it) can refer to them without the class
# prefix.
START = Stream.START
END = Stream.END
TEXT = Stream.TEXT
XML_DECL = Stream.XML_DECL
DOCTYPE = Stream.DOCTYPE
START_NS = Stream.START_NS
END_NS = Stream.END_NS
START_CDATA = Stream.START_CDATA
END_CDATA = Stream.END_CDATA
PI = Stream.PI
COMMENT = Stream.COMMENT
267
268def _ensure(stream):
269    """Ensure that every item on the stream is actually a markup event."""
270    stream = iter(stream)
271    event = stream.next()
272
273    # Check whether the iterable is a real markup event stream by examining the
274    # first item it yields; if it's not we'll need to do some conversion
275    if type(event) is not tuple or len(event) != 3:
276        for event in chain([event], stream):
277            if hasattr(event, 'totuple'):
278                event = event.totuple()
279            else:
280                event = TEXT, unicode(event), (None, -1, -1)
281            yield event
282        return
283
284    # This looks like a markup event stream, so we'll just pass it through
285    # unchanged
286    yield event
287    for event in stream:
288        yield event
289
290
class Attrs(tuple):
    """Immutable sequence type that stores the attributes of an element.

    Ordering of the attributes is preserved, while access by name is also
    supported.

    >>> attrs = Attrs([('href', '#'), ('title', 'Foo')])
    >>> attrs
    Attrs([('href', '#'), ('title', 'Foo')])

    >>> 'href' in attrs
    True
    >>> 'tabindex' in attrs
    False
    >>> attrs.get('title')
    'Foo'

    Instances may not be manipulated directly. Instead, the operators ``|`` and
    ``-`` can be used to produce new instances that have specific attributes
    added, replaced or removed.

    To remove an attribute, use the ``-`` operator. The right hand side can be
    either a string or a set/sequence of strings, identifying the name(s) of
    the attribute(s) to remove:

    >>> attrs - 'title'
    Attrs([('href', '#')])
    >>> attrs - ('title', 'href')
    Attrs()

    The original instance is not modified, but the operator can of course be
    used with an assignment:

    >>> attrs
    Attrs([('href', '#'), ('title', 'Foo')])
    >>> attrs -= 'title'
    >>> attrs
    Attrs([('href', '#')])

    To add a new attribute, use the ``|`` operator, where the right hand value
    is a sequence of ``(name, value)`` tuples (which includes `Attrs`
    instances):

    >>> attrs | [('title', 'Bar')]
    Attrs([('href', '#'), ('title', 'Bar')])

    If the attributes already contain an attribute with a given name, the value
    of that attribute is replaced:

    >>> attrs | [('href', 'http://example.org/')]
    Attrs([('href', 'http://example.org/')])
    """
    __slots__ = []

    def __contains__(self, name):
        """Return whether the list includes an attribute with the specified
        name.

        :param name: the name of the attribute to look for
        :return: `True` if the list includes the attribute, `False` otherwise
        :rtype: `bool`
        """
        for attr, _ in self:
            if attr == name:
                return True
        # Return an explicit boolean rather than falling off the end with None
        return False

    def __getitem__(self, i):
        """Return an item or slice of the attributes list.

        Indexing with an integer returns the ``(name, value)`` tuple at that
        position; indexing with a slice object (including extended slices)
        returns a new `Attrs` instance.

        >>> attrs = Attrs([('href', '#'), ('title', 'Foo')])
        >>> attrs[1]
        ('title', 'Foo')
        >>> attrs[::2]
        Attrs([('href', '#')])
        """
        items = tuple.__getitem__(self, i)
        if type(i) is slice:
            return Attrs(items)
        return items

    def __getslice__(self, i, j):
        """Return a slice of the attributes list.

        >>> attrs = Attrs([('href', '#'), ('title', 'Foo')])
        >>> attrs[1:]
        Attrs([('title', 'Foo')])
        """
        return Attrs(tuple.__getslice__(self, i, j))

    def __or__(self, attrs):
        """Return a new instance that contains the attributes in `attrs` in
        addition to any already existing attributes.

        Attributes whose name is already present keep their position but take
        the new value; all others are appended in the order given.

        :param attrs: a sequence of ``(name, value)`` tuples
        :return: a new instance with the merged attributes
        :rtype: `Attrs`
        """
        repl = dict([(an, av) for an, av in attrs if an in self])
        return Attrs([(sn, repl.get(sn, sv)) for sn, sv in self] +
                     [(an, av) for an, av in attrs if an not in self])

    def __repr__(self):
        if not self:
            return 'Attrs()'
        return 'Attrs([%s])' % ', '.join([repr(item) for item in self])

    def __sub__(self, names):
        """Return a new instance from which all attributes with a name in
        `names` have been removed.

        :param names: the name of the attribute to remove, or a set/sequence
                      of such names
        :return: a new instance with the attribute(s) removed
        :rtype: `Attrs`
        """
        if isinstance(names, basestring):
            # A single name: wrap it so the membership test below does not
            # degrade into a substring check
            names = (names,)
        return Attrs([(name, val) for name, val in self if name not in names])

    def get(self, name, default=None):
        """Return the value of the attribute with the specified name, or the
        value of the `default` parameter if no such attribute is found.

        :param name: the name of the attribute
        :param default: the value to return when the attribute does not exist
        :return: the attribute value, or the `default` value if that attribute
                 does not exist
        :rtype: `object`
        """
        for attr, value in self:
            if attr == name:
                return value
        return default

    def totuple(self):
        """Return the attributes as a markup event.

        The returned event is a `TEXT` event, the data is the value of all
        attributes joined together.

        >>> Attrs([('href', '#'), ('title', 'Foo')]).totuple()
        ('TEXT', u'#Foo', (None, -1, -1))

        :return: a `TEXT` event
        :rtype: `tuple`
        """
        return TEXT, u''.join([x[1] for x in self]), (None, -1, -1)
421
422
class Markup(unicode):
    """Unicode string flagged as safe markup: it can be placed into HTML/XML
    output as is, without any further escaping.
    """
    __slots__ = []

    def __add__(self, other):
        left = unicode(self)
        right = unicode(escape(other))
        return Markup(left + right)

    def __radd__(self, other):
        left = unicode(escape(other))
        right = unicode(self)
        return Markup(left + right)

    def __mod__(self, args):
        # Escape the substitution values before they are interpolated
        if isinstance(args, dict):
            args = dict([(key, escape(val)) for key, val in args.items()])
        elif isinstance(args, (list, tuple)):
            args = tuple([escape(arg) for arg in args])
        else:
            args = escape(args)
        return Markup(unicode.__mod__(self, args))

    def __mul__(self, num):
        return Markup(unicode(self) * num)

    def __rmul__(self, num):
        return Markup(num * unicode(self))

    def __repr__(self):
        return '<%s %r>' % (self.__class__.__name__, unicode(self))

    def join(self, seq, escape_quotes=True):
        """Concatenate the strings in the given sequence into one `Markup`
        object, using this `Markup` object as the separator between them.

        Elements of the sequence that are not `Markup` instances get escaped
        automatically.

        :param seq: the sequence of strings to join
        :param escape_quotes: whether double quote characters in the elements
                              should be escaped
        :return: the joined `Markup` object
        :rtype: `Markup`
        :see: `escape`
        """
        escaped = [escape(item, quotes=escape_quotes) for item in seq]
        return Markup(unicode(self).join(escaped))

    def escape(cls, text, quotes=True):
        """Build a Markup instance from a string, escaping the special
        characters it may contain (<, >, & and \").

        >>> escape('"1 < 2"')
        <Markup u'&#34;1 &lt; 2&#34;'>

        If the `quotes` parameter is set to `False`, the \" character is left
        as is. Escaping quotes is generally only required for strings that are
        to be used in attribute values.

        >>> escape('"1 < 2"', quotes=False)
        <Markup u'"1 &lt; 2"'>

        :param text: the text to escape
        :param quotes: if ``True``, double quote characters are escaped in
                       addition to the other special characters
        :return: the escaped `Markup` string
        :rtype: `Markup`
        """
        if not text:
            return cls()
        if type(text) is cls:
            # Already markup, nothing to escape
            return text
        if hasattr(text, '__html__'):
            return Markup(text.__html__())

        text = unicode(text)
        # The ampersand has to be handled first, as the other replacements
        # introduce ampersands themselves
        for char, entity in (('&', '&amp;'), ('<', '&lt;'), ('>', '&gt;')):
            text = text.replace(char, entity)
        if quotes:
            text = text.replace('"', '&#34;')
        return cls(text)
    escape = classmethod(escape)

    def unescape(self):
        """Undo the escaping of &, <, >, and \" and return the result as a
        plain `unicode` object.

        >>> Markup('1 &lt; 2').unescape()
        u'1 < 2'

        :return: the unescaped string
        :rtype: `unicode`
        :see: `genshi.core.unescape`
        """
        if not self:
            return u''
        text = unicode(self)
        # The ampersand entity has to be handled last, so that its result is
        # not picked up by one of the other replacements
        for entity, char in (('&#34;', '"'), ('&gt;', '>'), ('&lt;', '<'),
                             ('&amp;', '&')):
            text = text.replace(entity, char)
        return text

    def stripentities(self, keepxmlentities=False):
        """Return a copy of the text in which character and numeric entities
        have been replaced by the equivalent characters.

        If the `keepxmlentities` parameter is provided and evaluates to `True`,
        the core XML entities (``&amp;``, ``&apos;``, ``&gt;``, ``&lt;`` and
        ``&quot;``) are not stripped.

        :return: a `Markup` instance with entities removed
        :rtype: `Markup`
        :see: `genshi.util.stripentities`
        """
        stripped = stripentities(self, keepxmlentities=keepxmlentities)
        return Markup(stripped)

    def striptags(self):
        """Return a copy of the text from which all XML/HTML tags have been
        removed.

        :return: a `Markup` instance with all tags removed
        :rtype: `Markup`
        :see: `genshi.util.striptags`
        """
        stripped = striptags(self)
        return Markup(stripped)
545
546
# Replace the pure-Python `Markup` class defined above with the one from the
# optional `genshi._speedups` module when that module can be imported
# (presumably a compiled extension -- not visible from this file).
try:
    from genshi._speedups import Markup
except ImportError:
    pass # just use the Python implementation

# Module-level shortcut for `Markup.escape`, bound to whichever `Markup`
# implementation ended up being selected above.
escape = Markup.escape
553
def unescape(text):
    """Undo the escaping of &, <, >, and \" and return a `unicode` object.

    >>> unescape(Markup('1 &lt; 2'))
    u'1 < 2'

    Objects that are not `Markup` instances are handed back untouched:

    >>> unescape('1 &lt; 2')
    '1 &lt; 2'

    :param text: the text to unescape
    :return: the unescaped string
    :rtype: `unicode`
    """
    if isinstance(text, Markup):
        return text.unescape()
    return text
573
574
class Namespace(object):
    """Utility class creating and testing elements with a namespace.

    Internally, namespace URIs are encoded in the `QName` of any element or
    attribute, the namespace URI being enclosed in curly braces. This class
    helps create and test these strings.

    A `Namespace` object is instantiated with the namespace URI.

    >>> html = Namespace('http://www.w3.org/1999/xhtml')
    >>> html
    <Namespace "http://www.w3.org/1999/xhtml">
    >>> html.uri
    u'http://www.w3.org/1999/xhtml'

    The `Namespace` object can then be used to generate `QName` objects with
    that namespace:

    >>> html.body
    QName(u'http://www.w3.org/1999/xhtml}body')
    >>> html.body.localname
    u'body'
    >>> html.body.namespace
    u'http://www.w3.org/1999/xhtml'

    The same works using item access notation, which is useful for element or
    attribute names that are not valid Python identifiers:

    >>> html['body']
    QName(u'http://www.w3.org/1999/xhtml}body')

    A `Namespace` object can also be used to test whether a specific `QName`
    belongs to that namespace using the ``in`` operator:

    >>> qname = html.body
    >>> qname in html
    True
    >>> qname in Namespace('http://www.w3.org/2002/06/xhtml2')
    False
    """
    def __new__(cls, uri):
        # Passing an existing instance of exactly this class gives back that
        # same instance instead of creating a copy
        if type(uri) is cls:
            return uri
        return object.__new__(cls)

    # Pickling support: the URI is both the constructor argument and the
    # complete state of an instance

    def __getnewargs__(self):
        return (self.uri,)

    def __getstate__(self):
        return self.uri

    def __setstate__(self, uri):
        self.uri = uri

    def __init__(self, uri):
        self.uri = unicode(uri)

    def __contains__(self, qname):
        return qname.namespace == self.uri

    def __ne__(self, other):
        return not self == other

    def __eq__(self, other):
        if isinstance(other, Namespace):
            return self.uri == other.uri
        return self.uri == other

    def __getitem__(self, name):
        """Return the `QName` for the given local name in this namespace."""
        return QName(self.uri + u'}' + name)

    def __getattr__(self, name):
        """Return the `QName` for the given local name in this namespace.

        This is only invoked for attributes that are not found through normal
        lookup. Special (double underscore) names are rejected, so that
        protocol probes such as ``hasattr(ns, '__html__')`` or the lookup of
        ``__deepcopy__`` by the `copy` module do not get a `QName` back. A
        missing ``uri`` (instance created but not yet initialized) is also
        rejected, as building the name would otherwise recurse endlessly.

        :raises AttributeError: for special names and for ``uri``
        """
        if name == 'uri' or (name.startswith('__') and name.endswith('__')):
            raise AttributeError(name)
        return self[name]

    def __hash__(self):
        return hash(self.uri)

    def __repr__(self):
        return '<Namespace "%s">' % self.uri

    def __str__(self):
        return self.uri.encode('utf-8')

    def __unicode__(self):
        return self.uri
658
659
660# The namespace used by attributes such as xml:lang and xml:space
661XML_NAMESPACE = Namespace('http://www.w3.org/XML/1998/namespace')
662
663
class QName(unicode):
    """A qualified element or attribute name.

    The unicode value of instances of this class contains the qualified name of
    the element or attribute, in the form ``{namespace-uri}local-name``. The
    namespace URI can be obtained through the additional `namespace` attribute,
    while the local name can be accessed through the `localname` attribute.

    >>> qname = QName('foo')
    >>> qname
    QName(u'foo')
    >>> qname.localname
    u'foo'
    >>> qname.namespace

    >>> qname = QName('http://www.w3.org/1999/xhtml}body')
    >>> qname
    QName(u'http://www.w3.org/1999/xhtml}body')
    >>> qname.localname
    u'body'
    >>> qname.namespace
    u'http://www.w3.org/1999/xhtml'

    The leading curly brace may be included as well, giving the same result:

    >>> QName('{http://www.w3.org/1999/xhtml}body') == qname
    True
    """
    __slots__ = ['namespace', 'localname']

    def __new__(cls, qname):
        """Create the `QName` instance.

        :param qname: the qualified name as a string of the form
                      ``{namespace-uri}local-name``, where the leading curly
                      brace is optional
        """
        if type(qname) is cls:
            return qname

        # Drop the optional leading brace up front, so that it is not doubled
        # when the canonical ``{namespace-uri}local-name`` value is built below
        qname = qname.lstrip(u'{')
        parts = qname.split(u'}', 1)
        if len(parts) > 1:
            self = unicode.__new__(cls, u'{%s' % qname)
            self.namespace, self.localname = map(unicode, parts)
        else:
            # No closing brace: a plain local name without a namespace
            self = unicode.__new__(cls, qname)
            self.namespace, self.localname = None, unicode(qname)
        return self

    def __getnewargs__(self):
        # The brace-less form is accepted by `__new__`, see above
        return (self.lstrip('{'),)

    def __repr__(self):
        return 'QName(%s)' % unicode.__repr__(self.lstrip('{'))
Note: See TracBrowser for help on using the repository browser.