Edgewall Software

source: trunk/genshi/core.py

Last change on this file was 1251, checked in by hodgestar, 10 years ago

Revert r1245 (the Python3.4 regression was fixed in http://hg.python.org/cpython/rev/b328f8ccbccf).

  • Property svn:eol-style set to native
File size: 25.1 KB
Line 
1# -*- coding: utf-8 -*-
2#
3# Copyright (C) 2006-2009 Edgewall Software
4# All rights reserved.
5#
6# This software is licensed as described in the file COPYING, which
7# you should have received as part of this distribution. The terms
8# are also available at http://genshi.edgewall.org/wiki/License.
9#
10# This software consists of voluntary contributions made by many
11# individuals. For the exact contribution history, see the revision
12# history and logs, available at http://genshi.edgewall.org/log/.
13
14"""Core classes for markup processing."""
15
16try:
17    reduce # builtin in Python < 3
18except NameError:
19    from functools import reduce
20import sys
21from itertools import chain
22import operator
23
24from genshi.util import plaintext, stripentities, striptags, stringrepr
25
26__all__ = ['Stream', 'Markup', 'escape', 'unescape', 'Attrs', 'Namespace',
27           'QName']
28__docformat__ = 'restructuredtext en'
29
30
class StreamEventKind(str):
    """String subclass naming one kind of event on a markup stream.

    Instances are kept in a class-level registry keyed by their value, so
    asking for the same kind twice hands back the very same object.
    """
    __slots__ = []
    _instances = {}

    def __new__(cls, val):
        registry = cls._instances
        candidate = str.__new__(cls, val)
        # ``setdefault`` keeps the first instance registered for this value
        # and discards the candidate when one already exists
        return registry.setdefault(val, candidate)
38
39
class Stream(object):
    """An iterable sequence of markup events.

    Iterating over a stream yields its events one at a time. Every event is
    a tuple with three items::

      (kind, data, position)

    ``kind`` identifies the type of event (such as `START`, `END`, `TEXT`,
    etc), the content of ``data`` varies with the kind of event, and
    ``position`` is a ``(filename, line, offset)`` tuple locating the original
    element or text in the input. When that location is not known,
    ``position`` is ``(None, -1, -1)``.

    A stream can also be turned into text: `serialize()` returns an iterator
    over the generated strings, whereas `render()` produces the complete
    output in one go. Both take various parameters that influence how the
    stream is serialized.
    """
    __slots__ = ['events', 'serializer']

    START = StreamEventKind('START') #: a start tag
    END = StreamEventKind('END') #: an end tag
    TEXT = StreamEventKind('TEXT') #: literal text
    XML_DECL = StreamEventKind('XML_DECL') #: XML declaration
    DOCTYPE = StreamEventKind('DOCTYPE') #: doctype declaration
    START_NS = StreamEventKind('START_NS') #: start namespace mapping
    END_NS = StreamEventKind('END_NS') #: end namespace mapping
    START_CDATA = StreamEventKind('START_CDATA') #: start CDATA section
    END_CDATA = StreamEventKind('END_CDATA') #: end CDATA section
    PI = StreamEventKind('PI') #: processing instruction
    COMMENT = StreamEventKind('COMMENT') #: comment

    def __init__(self, events, serializer=None):
        """Create a stream on top of a sequence of markup events.

        :param events: a sequence or iterable providing the events
        :param serializer: the default serialization method to use for this
                           stream

        :note: Changed in 0.5: added the `serializer` argument
        """
        self.serializer = serializer #: The default serialization method
        self.events = events #: The underlying iterable producing the events

    def __iter__(self):
        return iter(self.events)

    def __or__(self, function):
        """Implement the "bitwise or" operator so that filters and serializers
        can be chained onto the stream, much like pipes on Unix shells.

        Assume the following stream produced by the `HTML` function:

        >>> from genshi.input import HTML
        >>> html = HTML('''<p onclick="alert('Whoa')">Hello, world!</p>''', encoding='utf-8')
        >>> print(html)
        <p onclick="alert('Whoa')">Hello, world!</p>

        A filter such as the HTML sanitizer can be applied to that stream using
        the pipe notation as follows:

        >>> from genshi.filters import HTMLSanitizer
        >>> sanitizer = HTMLSanitizer()
        >>> print(html | sanitizer)
        <p>Hello, world!</p>

        Filters can be any function that accepts and produces a stream (where
        a stream is anything that iterates over events):

        >>> def uppercase(stream):
        ...     for kind, data, pos in stream:
        ...         if kind is TEXT:
        ...             data = data.upper()
        ...         yield kind, data, pos
        >>> print(html | sanitizer | uppercase)
        <p>HELLO, WORLD!</p>

        Serializers can also be used with this notation:

        >>> from genshi.output import TextSerializer
        >>> output = TextSerializer()
        >>> print(html | sanitizer | uppercase | output)
        HELLO, WORLD!

        Commonly, serializers should be used at the end of the "pipeline";
        using them somewhere in the middle may produce unexpected results.

        :param function: the callable object that should be applied as a filter
        :return: the filtered stream, keeping this stream's default serializer
        :rtype: `Stream`
        """
        filtered = function(self)
        return Stream(_ensure(filtered), serializer=self.serializer)

    def filter(self, *filters):
        """Return a new stream with the given filters applied in order.

        Each filter must be a callable that takes the stream object as its
        parameter and returns the filtered stream.

        The call::

            stream.filter(filter1, filter2)

        is equivalent to::

            stream | filter1 | filter2

        :param filters: one or more callable objects that should be applied as
                        filters
        :return: the filtered stream
        :rtype: `Stream`
        """
        stream = self
        for function in filters:
            stream = stream | function
        return stream

    def render(self, method=None, encoding=None, out=None, **kwargs):
        """Serialize the whole stream and return the result as one string.

        Any additional keyword arguments are passed to the serializer, and thus
        depend on the `method` parameter value.

        :param method: determines how the stream is serialized; can be either
                       "xml", "xhtml", "html", "text", or a custom serializer
                       class; if `None`, the default serialization method of
                       the stream is used
        :param encoding: how the output string should be encoded; if set to
                         `None`, this method returns a `unicode` object
        :param out: a file-like object that the output should be written to
                    instead of being returned as one big string; note that if
                    this is a file or socket (or similar), the `encoding` must
                    not be `None` (that is, the output must be encoded)
        :return: a `str` or `unicode` object (depending on the `encoding`
                 parameter), or `None` if the `out` parameter is provided
        :rtype: `basestring`

        :see: XMLSerializer, XHTMLSerializer, HTMLSerializer, TextSerializer
        :note: Changed in 0.5: added the `out` parameter
        """
        from genshi.output import encode
        # Fall back to the stream's own default, and to XML as a last resort
        chosen = method if method is not None else (self.serializer or 'xml')
        chunks = self.serialize(method=chosen, **kwargs)
        return encode(chunks, method=chosen, encoding=encoding, out=out)

    def select(self, path, namespaces=None, variables=None):
        """Return a new stream holding only the events that match the given
        XPath expression.

        >>> from genshi import HTML
        >>> stream = HTML('<doc><elem>foo</elem><elem>bar</elem></doc>', encoding='utf-8')
        >>> print(stream.select('elem'))
        <elem>foo</elem><elem>bar</elem>
        >>> print(stream.select('elem/text()'))
        foobar

        Note that the outermost element of the stream becomes the *context
        node* for the XPath test. That means that the expression "doc" would
        not match anything in the example above, because it only tests against
        child elements of the outermost element:

        >>> print(stream.select('doc'))
        <BLANKLINE>

        You can use the "." expression to match the context node itself
        (although that usually makes little sense):

        >>> print(stream.select('.'))
        <doc><elem>foo</elem><elem>bar</elem></doc>

        :param path: a string containing the XPath expression
        :param namespaces: mapping of namespace prefixes used in the path
        :param variables: mapping of variable names to values
        :return: the selected substream
        :rtype: `Stream`
        :raises PathSyntaxError: if the given path expression is invalid or not
                                 supported
        """
        from genshi.path import Path
        xpath = Path(path)
        return xpath.select(self, namespaces, variables)

    def serialize(self, method='xml', **kwargs):
        """Produce the serialized form of the stream piece by piece.

        In contrast to `render()`, the output is not collected into a single
        string; an iterator over the individual strings is returned instead.

        Any additional keyword arguments are passed to the serializer, and thus
        depend on the `method` parameter value.

        :param method: determines how the stream is serialized; can be either
                       "xml", "xhtml", "html", "text", or a custom serializer
                       class; if `None`, the default serialization method of
                       the stream is used
        :return: an iterator over the serialization results (`Markup` or
                 `unicode` objects, depending on the serialization method)
        :rtype: ``iterator``
        :see: XMLSerializer, XHTMLSerializer, HTMLSerializer, TextSerializer
        """
        from genshi.output import get_serializer
        if method is None:
            method = self.serializer or 'xml'
        serializer = get_serializer(method, **kwargs)
        return serializer(_ensure(self))

    def __str__(self):
        return self.render()

    def __unicode__(self):
        return self.render(encoding=None)

    def __html__(self):
        return self
255
256
# Module-level aliases for the event kinds declared on `Stream`, so that they
# can be referenced without the class prefix.

# Elements and text
START = Stream.START
END = Stream.END
TEXT = Stream.TEXT

# Document prolog
XML_DECL = Stream.XML_DECL
DOCTYPE = Stream.DOCTYPE

# Namespace mappings
START_NS = Stream.START_NS
END_NS = Stream.END_NS

# CDATA sections
START_CDATA = Stream.START_CDATA
END_CDATA = Stream.END_CDATA

# Processing instructions and comments
PI = Stream.PI
COMMENT = Stream.COMMENT
268
269
270def _ensure(stream):
271    """Ensure that every item on the stream is actually a markup event."""
272    stream = iter(stream)
273    event = stream.next()
274
275    # Check whether the iterable is a real markup event stream by examining the
276    # first item it yields; if it's not we'll need to do some conversion
277    if type(event) is not tuple or len(event) != 3:
278        for event in chain([event], stream):
279            if hasattr(event, 'totuple'):
280                event = event.totuple()
281            else:
282                event = TEXT, unicode(event), (None, -1, -1)
283            yield event
284        return
285
286    # This looks like a markup event stream, so we'll just pass it through
287    # unchanged
288    yield event
289    for event in stream:
290        yield event
291
292
class Attrs(tuple):
    """Immutable sequence type that stores the attributes of an element.

    Ordering of the attributes is preserved, while access by name is also
    supported.

    >>> attrs = Attrs([('href', '#'), ('title', 'Foo')])
    >>> attrs
    Attrs([('href', '#'), ('title', 'Foo')])

    >>> 'href' in attrs
    True
    >>> 'tabindex' in attrs
    False
    >>> attrs.get('title')
    'Foo'

    Instances may not be manipulated directly. Instead, the operators ``|`` and
    ``-`` can be used to produce new instances that have specific attributes
    added, replaced or removed.

    To remove an attribute, use the ``-`` operator. The right hand side can be
    either a string or a set/sequence of strings, identifying the name(s) of
    the attribute(s) to remove:

    >>> attrs - 'title'
    Attrs([('href', '#')])
    >>> attrs - ('title', 'href')
    Attrs()

    The original instance is not modified, but the operator can of course be
    used with an assignment:

    >>> attrs
    Attrs([('href', '#'), ('title', 'Foo')])
    >>> attrs -= 'title'
    >>> attrs
    Attrs([('href', '#')])

    To add a new attribute, use the ``|`` operator, where the right hand value
    is a sequence of ``(name, value)`` tuples (which includes `Attrs`
    instances):

    >>> attrs | [('title', 'Bar')]
    Attrs([('href', '#'), ('title', 'Bar')])

    If the attributes already contain an attribute with a given name, the value
    of that attribute is replaced:

    >>> attrs | [('href', 'http://example.org/')]
    Attrs([('href', 'http://example.org/')])
    """
    __slots__ = []

    def __contains__(self, name):
        """Return whether the list includes an attribute with the specified
        name.

        :param name: the name of the attribute to look for
        :return: `True` if the list includes the attribute
        :rtype: `bool`
        """
        for attr, _ in self:
            if attr == name:
                return True
        return False

    def __getitem__(self, i):
        """Return an item or slice of the attributes list.

        >>> attrs = Attrs([('href', '#'), ('title', 'Foo')])
        >>> attrs[1]
        ('title', 'Foo')
        >>> attrs[1:]
        Attrs([('title', 'Foo')])
        """
        items = tuple.__getitem__(self, i)
        if type(i) is slice:
            # Keep slices in the `Attrs` type rather than a plain tuple
            return Attrs(items)
        return items

    def __getslice__(self, i, j):
        """Return a slice of the attributes list.

        >>> attrs = Attrs([('href', '#'), ('title', 'Foo')])
        >>> attrs[1:]
        Attrs([('title', 'Foo')])
        """
        return Attrs(tuple.__getslice__(self, i, j))

    def __or__(self, attrs):
        """Return a new instance that contains the attributes in `attrs` in
        addition to any already existing attributes. Any attributes in the new
        set that have a value of `None` are removed.

        Existing attributes keep their position; attributes with new names
        are appended in the order in which `attrs` provides them. Any
        iterable of ``(name, value)`` pairs is accepted, including one-shot
        iterators:

        >>> Attrs([('href', '#')]) | iter([('title', 'Bar')])
        Attrs([('href', '#'), ('title', 'Bar')])

        :param attrs: an iterable of ``(name, value)`` tuples
        :return: a new instance with the merged attributes
        :rtype: `Attrs`
        """
        # Materialize once: the pairs are traversed several times below, and a
        # one-shot iterable (such as a generator) would be exhausted after the
        # first pass, silently dropping its additions and replacements
        attrs = list(attrs)
        remove = set([an for an, av in attrs if av is None])
        replace = dict([(an, av) for an, av in attrs
                        if an in self and av is not None])
        return Attrs([(sn, replace.get(sn, sv)) for sn, sv in self
                      if sn not in remove] +
                     [(an, av) for an, av in attrs
                      if an not in self and an not in remove])

    def __repr__(self):
        if not self:
            return 'Attrs()'
        return 'Attrs([%s])' % ', '.join([repr(item) for item in self])

    def __sub__(self, names):
        """Return a new instance from which all attributes with a name in
        `names` are removed.

        :param names: the names of the attributes to remove; either a single
                      string or a set/sequence of strings
        :return: a new instance with the attribute removed
        :rtype: `Attrs`
        """
        if isinstance(names, basestring):
            # A lone string names one attribute; without wrapping it, the
            # ``in`` test below would perform substring matching
            names = (names,)
        return Attrs([(name, val) for name, val in self if name not in names])

    def get(self, name, default=None):
        """Return the value of the attribute with the specified name, or the
        value of the `default` parameter if no such attribute is found.

        :param name: the name of the attribute
        :param default: the value to return when the attribute does not exist
        :return: the attribute value, or the `default` value if that attribute
                 does not exist
        :rtype: `object`
        """
        for attr, value in self:
            if attr == name:
                return value
        return default

    def totuple(self):
        """Return the attributes as a markup event.

        The returned event is a `TEXT` event, the data is the value of all
        attributes joined together.

        >>> Attrs([('href', '#'), ('title', 'Foo')]).totuple()
        ('TEXT', '#Foo', (None, -1, -1))

        :return: a `TEXT` event
        :rtype: `tuple`
        """
        return TEXT, ''.join([x[1] for x in self]), (None, -1, -1)
443
444
class Markup(unicode):
    """Unicode string flagged as safe for inclusion in HTML/XML output, so
    that it does not need to be escaped.

    Operators that combine a `Markup` object with other values escape those
    values first and produce a new `Markup` object.
    """
    __slots__ = []

    def __add__(self, other):
        combined = unicode.__add__(self, escape(other))
        return Markup(combined)

    def __radd__(self, other):
        combined = unicode.__add__(escape(other), self)
        return Markup(combined)

    def __mod__(self, args):
        # Escape every value being interpolated, whatever container it is in
        if isinstance(args, dict):
            safe_args = dict([(key, escape(value))
                              for key, value in args.items()])
        elif isinstance(args, (list, tuple)):
            safe_args = tuple([escape(arg) for arg in args])
        else:
            safe_args = escape(args)
        return Markup(unicode.__mod__(self, safe_args))

    def __mul__(self, num):
        repeated = unicode.__mul__(self, num)
        return Markup(repeated)
    __rmul__ = __mul__

    def __repr__(self):
        class_name = type(self).__name__
        return "<%s %s>" % (class_name, unicode.__repr__(self))

    def join(self, seq, escape_quotes=True):
        """Concatenate the strings in the given sequence, using this `Markup`
        object as the separator between them, and return the result as a
        `Markup` object.

        Any element in the sequence that is not a `Markup` instance is
        automatically escaped.

        :param seq: the sequence of strings to join
        :param escape_quotes: whether double quote characters in the elements
                              should be escaped
        :return: the joined `Markup` object
        :rtype: `Markup`
        :see: `escape`
        """
        escaped_items = [escape(item, quotes=escape_quotes) for item in seq]
        return Markup(unicode.join(self, escaped_items))

    @classmethod
    def escape(cls, text, quotes=True):
        """Create a Markup instance from a string and escape special characters
        it may contain (<, >, & and \").

        >>> escape('"1 < 2"')
        <Markup u'&#34;1 &lt; 2&#34;'>

        If the `quotes` parameter is set to `False`, the \" character is left
        as is. Escaping quotes is generally only required for strings that are
        to be used in attribute values.

        >>> escape('"1 < 2"', quotes=False)
        <Markup u'"1 &lt; 2"'>

        A false value yields an empty instance, an object that already is of
        this exact class is returned unchanged, and an object providing an
        ``__html__()`` method is converted from the result of that method
        without any further escaping.

        :param text: the text to escape
        :param quotes: if ``True``, double quote characters are escaped in
                       addition to the other special characters
        :return: the escaped `Markup` string
        :rtype: `Markup`
        """
        if not text:
            return cls()
        if type(text) is cls:
            return text
        if hasattr(text, '__html__'):
            return cls(text.__html__())

        # The ampersand has to go first, otherwise the entities inserted for
        # the other characters would get escaped a second time
        escaped = text
        for char, entity in (('&', '&amp;'), ('<', '&lt;'), ('>', '&gt;')):
            escaped = escaped.replace(char, entity)
        if quotes:
            escaped = escaped.replace('"', '&#34;')
        return cls(escaped)

    def unescape(self):
        """Reverse-escapes &, <, >, and \" and returns a `unicode` object.

        >>> Markup('1 &lt; 2').unescape()
        u'1 < 2'

        :return: the unescaped string
        :rtype: `unicode`
        :see: `genshi.core.unescape`
        """
        if not self:
            return ''
        # The ampersand entity is handled last, so that text such as
        # ``&amp;lt;`` is unescaped only once
        plain = unicode(self)
        for entity, char in (('&#34;', '"'), ('&gt;', '>'), ('&lt;', '<'),
                             ('&amp;', '&')):
            plain = plain.replace(entity, char)
        return plain

    def stripentities(self, keepxmlentities=False):
        """Return a copy of the text with any character or numeric entities
        replaced by the equivalent UTF-8 characters.

        If the `keepxmlentities` parameter is provided and evaluates to `True`,
        the core XML entities (``&amp;``, ``&apos;``, ``&gt;``, ``&lt;`` and
        ``&quot;``) are not stripped.

        :param keepxmlentities: whether to leave the core XML entities alone
        :return: a `Markup` instance with entities removed
        :rtype: `Markup`
        :see: `genshi.util.stripentities`
        """
        stripped = stripentities(self, keepxmlentities=keepxmlentities)
        return Markup(stripped)

    def striptags(self):
        """Return a copy of the text with all XML/HTML tags removed.

        :return: a `Markup` instance with all tags removed
        :rtype: `Markup`
        :see: `genshi.util.striptags`
        """
        stripped = striptags(self)
        return Markup(stripped)
565
566
try:
    # When the optional ``genshi._speedups`` module can be imported, its
    # `Markup` class replaces the one defined above
    from genshi._speedups import Markup
except ImportError:
    pass # just use the Python implementation


# Module-level shortcut for `Markup.escape`; bound after the import attempt so
# that it refers to whichever `Markup` class ended up being used
escape = Markup.escape
574
575
def unescape(text):
    """Reverse-escapes &, <, >, and \" and returns a `unicode` object.

    >>> unescape(Markup('1 &lt; 2'))
    u'1 < 2'

    Only `Markup` instances are unescaped; any other `text` object is handed
    back exactly as it was passed in.

    >>> unescape('1 &lt; 2')
    '1 &lt; 2'

    :param text: the text to unescape
    :return: the unescaped string
    :rtype: `unicode`
    """
    return text.unescape() if isinstance(text, Markup) else text
595
596
class Namespace(object):
    """Utility class creating and testing elements with a namespace.

    Internally, namespace URIs are encoded in the `QName` of any element or
    attribute, the namespace URI being enclosed in curly braces. This class
    helps create and test these strings.

    A `Namespace` object is instantiated with the namespace URI.

    >>> html = Namespace('http://www.w3.org/1999/xhtml')
    >>> html
    Namespace('http://www.w3.org/1999/xhtml')
    >>> html.uri
    u'http://www.w3.org/1999/xhtml'

    The `Namespace` object can than be used to generate `QName` objects with
    that namespace:

    >>> html.body
    QName('http://www.w3.org/1999/xhtml}body')
    >>> html.body.localname
    u'body'
    >>> html.body.namespace
    u'http://www.w3.org/1999/xhtml'

    The same works using item access notation, which is useful for element or
    attribute names that are not valid Python identifiers:

    >>> html['body']
    QName('http://www.w3.org/1999/xhtml}body')

    A `Namespace` object can also be used to test whether a specific `QName`
    belongs to that namespace using the ``in`` operator:

    >>> qname = html.body
    >>> qname in html
    True
    >>> qname in Namespace('http://www.w3.org/2002/06/xhtml2')
    False
    """
    def __new__(cls, uri):
        # Passing an existing instance of this exact class returns that same
        # instance instead of allocating a new one
        if type(uri) is cls:
            return uri
        return object.__new__(cls)

    # Pickle support: the URI serves both as the argument handed to
    # ``__new__`` and as the complete instance state
    def __getnewargs__(self):
        return (self.uri,)

    def __getstate__(self):
        return self.uri

    def __setstate__(self, uri):
        self.uri = uri

    def __init__(self, uri):
        """Initialize the namespace.

        :param uri: the namespace URI; it is converted to its text form, so
                    another `Namespace` instance is accepted as well
        """
        self.uri = unicode(uri)

    def __contains__(self, qname):
        # The test relies on the ``namespace`` attribute of the given object
        return qname.namespace == self.uri

    def __ne__(self, other):
        return not self == other

    def __eq__(self, other):
        # Compares equal to namespaces with the same URI, and to anything
        # that compares equal to the URI itself
        if isinstance(other, Namespace):
            return self.uri == other.uri
        return self.uri == other

    def __getitem__(self, name):
        """Return the `QName` for `name` in this namespace."""
        return QName(self.uri + '}' + name)
    # Attribute access (``ns.body``) is routed through the same code; note
    # that this only applies to names not found by the normal lookup
    __getattr__ = __getitem__

    def __hash__(self):
        # Consistent with ``__eq__``: hashes like the URI
        return hash(self.uri)

    if sys.version_info[0] == 2:
        # Only use stringrepr in python 2
        def __repr__(self):
            return '%s(%s)' % (type(self).__name__, stringrepr(self.uri))

        def __str__(self):
            # Byte strings are the native ``str`` type on Python 2
            return self.uri.encode('utf-8')
    else:
        def __repr__(self):
            return '%s(%r)' % (type(self).__name__, self.uri)

        def __str__(self):
            # ``__str__`` has to return text here; returning the encoded bytes
            # would make ``str(namespace)`` raise a `TypeError`
            return self.uri

    def __unicode__(self):
        return self.uri
685
686
# Namespace of the predefined ``xml:`` attributes, such as ``xml:lang`` and
# ``xml:space``
XML_NAMESPACE = Namespace('http://www.w3.org/XML/1998/namespace')
689
690
class QName(unicode):
    """The qualified name of an element or attribute.

    The unicode value of an instance is the qualified name in the form
    ``{namespace-uri}local-name``. The two parts are additionally exposed as
    attributes: `namespace` holds the namespace URI (`None` for names without
    a namespace), and `localname` holds the local name.

    >>> qname = QName('foo')
    >>> qname
    QName('foo')
    >>> qname.localname
    u'foo'
    >>> qname.namespace

    >>> qname = QName('http://www.w3.org/1999/xhtml}body')
    >>> qname
    QName('http://www.w3.org/1999/xhtml}body')
    >>> qname.localname
    u'body'
    >>> qname.namespace
    u'http://www.w3.org/1999/xhtml'
    """
    __slots__ = ['namespace', 'localname']

    def __new__(cls, qname):
        """Create the `QName` instance.

        An object that already is of this exact class is returned as is.

        :param qname: the qualified name as a string of the form
                      ``{namespace-uri}local-name``, where the leading curly
                      brace is optional
        """
        if type(qname) is cls:
            return qname

        stripped = qname.lstrip('{')
        pieces = stripped.split('}', 1)
        if len(pieces) == 1:
            # No closing brace, so the name carries no namespace
            instance = unicode.__new__(cls, stripped)
            instance.namespace = None
            instance.localname = unicode(stripped)
            return instance

        # Put back the opening brace that was stripped above, so the value
        # always has the canonical ``{namespace-uri}local-name`` form
        uri, local = pieces
        instance = unicode.__new__(cls, '{%s' % stripped)
        instance.namespace = unicode(uri)
        instance.localname = unicode(local)
        return instance

    def __getnewargs__(self):
        # The brace-less form is accepted by ``__new__`` just as well
        return (self.lstrip('{'),)

    if sys.version_info[0] == 2:
        # Only use stringrepr in python 2
        def __repr__(self):
            class_name = type(self).__name__
            return '%s(%s)' % (class_name, stringrepr(self.lstrip('{')))
    else:
        def __repr__(self):
            class_name = type(self).__name__
            return '%s(%r)' % (class_name, self.lstrip('{'))
Note: See TracBrowser for help on using the repository browser.