Edgewall Software

source: tags/0.5.0/genshi/core.py

Last change on this file was 861, checked in by cmlenz, 15 years ago

Implement the __html__ protocol as suggested in #202. This would allow Genshi to be used in combination with other markup generating tools, as long as they support the same protocol.

  • Property svn:eol-style set to native
File size: 23.9 KB
Line 
1# -*- coding: utf-8 -*-
2#
3# Copyright (C) 2006-2008 Edgewall Software
4# All rights reserved.
5#
6# This software is licensed as described in the file COPYING, which
7# you should have received as part of this distribution. The terms
8# are also available at http://genshi.edgewall.org/wiki/License.
9#
10# This software consists of voluntary contributions made by many
11# individuals. For the exact contribution history, see the revision
12# history and logs, available at http://genshi.edgewall.org/log/.
13
14"""Core classes for markup processing."""
15
16from itertools import chain
17import operator
18
19from genshi.util import plaintext, stripentities, striptags
20
21__all__ = ['Stream', 'Markup', 'escape', 'unescape', 'Attrs', 'Namespace',
22           'QName']
23__docformat__ = 'restructuredtext en'
24
25
class StreamEventKind(str):
    """String subclass identifying the kind of an event on a markup stream.

    Instances are interned per value: constructing the class twice with the
    same value yields the very same object, which is what allows event kinds
    to be compared using the ``is`` operator.
    """
    __slots__ = []
    _instances = {} # maps each value to the one instance created for it

    def __new__(cls, val):
        # A fresh instance is always built; `setdefault` then either stores it
        # or hands back the instance that was registered earlier.
        candidate = str.__new__(cls, val)
        return cls._instances.setdefault(val, candidate)
33
34
class Stream(object):
    """A stream of markup events.

    A stream is essentially a thin wrapper around an iterable of events, where
    every event is a tuple of the form::

      (kind, data, position)

    ``kind`` is the event kind (such as `START`, `END`, `TEXT`, etc), the
    content of ``data`` depends on the kind of event, and ``position`` is a
    ``(filename, line, offset)`` tuple pointing at the location of the original
    element or text in the input. When that location is not known,
    ``position`` is ``(None, -1, -1)``.

    The stream can also be serialized to text: `serialize()` returns an
    iterator over the generated strings, whereas `render()` returns the
    complete generated text at once. Both accept various parameters that
    influence the way the stream is serialized.
    """
    __slots__ = ['events', 'serializer']

    START = StreamEventKind('START') #: a start tag
    END = StreamEventKind('END') #: an end tag
    TEXT = StreamEventKind('TEXT') #: literal text
    XML_DECL = StreamEventKind('XML_DECL') #: XML declaration
    DOCTYPE = StreamEventKind('DOCTYPE') #: doctype declaration
    START_NS = StreamEventKind('START_NS') #: start namespace mapping
    END_NS = StreamEventKind('END_NS') #: end namespace mapping
    START_CDATA = StreamEventKind('START_CDATA') #: start CDATA section
    END_CDATA = StreamEventKind('END_CDATA') #: end CDATA section
    PI = StreamEventKind('PI') #: processing instruction
    COMMENT = StreamEventKind('COMMENT') #: comment

    def __init__(self, events, serializer=None):
        """Create a stream on top of a sequence of markup events.

        :param events: a sequence or iterable providing the events
        :param serializer: the default serialization method to use for this
                           stream

        :note: Changed in 0.5: added the `serializer` argument
        """
        self.events = events #: The underlying iterable producing the events
        self.serializer = serializer #: The default serialization method

    def __iter__(self):
        """Return an iterator over the underlying events."""
        return iter(self.events)

    def __or__(self, function):
        """Override the "bitwise or" operator to apply filters or serializers
        to the stream, providing a syntax similar to pipes on Unix shells.

        Assume the following stream produced by the `HTML` function:

        >>> from genshi.input import HTML
        >>> html = HTML('''<p onclick="alert('Whoa')">Hello, world!</p>''')
        >>> print html
        <p onclick="alert('Whoa')">Hello, world!</p>

        A filter such as the HTML sanitizer can be applied to that stream using
        the pipe notation as follows:

        >>> from genshi.filters import HTMLSanitizer
        >>> sanitizer = HTMLSanitizer()
        >>> print html | sanitizer
        <p>Hello, world!</p>

        Filters can be any function that accepts and produces a stream (where
        a stream is anything that iterates over events):

        >>> def uppercase(stream):
        ...     for kind, data, pos in stream:
        ...         if kind is TEXT:
        ...             data = data.upper()
        ...         yield kind, data, pos
        >>> print html | sanitizer | uppercase
        <p>HELLO, WORLD!</p>

        Serializers can also be used with this notation:

        >>> from genshi.output import TextSerializer
        >>> output = TextSerializer()
        >>> print html | sanitizer | uppercase | output
        HELLO, WORLD!

        Commonly, serializers should be used at the end of the "pipeline";
        using them somewhere in the middle may produce unexpected results.

        :param function: the callable object that should be applied as a filter
        :return: the filtered stream
        :rtype: `Stream`
        """
        # Whatever the callable produces is passed through `_ensure()` and
        # wrapped in a new stream that inherits this stream's default
        # serializer.
        filtered = function(self)
        return Stream(_ensure(filtered), serializer=self.serializer)

    def filter(self, *filters):
        """Apply any number of filters to the stream, one after the other.

        A new stream with the given filters applied is returned. Every filter
        must be a callable that accepts the stream object as parameter and
        returns the filtered stream.

        The call::

            stream.filter(filter1, filter2)

        is equivalent to::

            stream | filter1 | filter2

        :param filters: one or more callable objects that should be applied as
                        filters
        :return: the filtered stream
        :rtype: `Stream`
        """
        stream = self
        for func in filters:
            stream = stream | func
        return stream

    def render(self, method=None, encoding='utf-8', out=None, **kwargs):
        """Return a string representation of the stream.

        Any additional keyword arguments are passed to the serializer, and thus
        depend on the `method` parameter value.

        :param method: determines how the stream is serialized; can be either
                       "xml", "xhtml", "html", "text", or a custom serializer
                       class; if `None`, the default serialization method of
                       the stream is used
        :param encoding: how the output string should be encoded; if set to
                         `None`, this method returns a `unicode` object
        :param out: a file-like object that the output should be written to
                    instead of being returned as one big string; note that if
                    this is a file or socket (or similar), the `encoding` must
                    not be `None` (that is, the output must be encoded)
        :return: a `str` or `unicode` object (depending on the `encoding`
                 parameter), or `None` if the `out` parameter is provided
        :rtype: `basestring`

        :see: XMLSerializer, XHTMLSerializer, HTMLSerializer, TextSerializer
        :note: Changed in 0.5: added the `out` parameter
        """
        from genshi.output import encode
        if method is None:
            # Use the stream's own default, or XML if it does not have one
            method = self.serializer or 'xml'
        chunks = self.serialize(method=method, **kwargs)
        return encode(chunks, method=method, encoding=encoding, out=out)

    def select(self, path, namespaces=None, variables=None):
        """Return a new stream that contains the events matching the given
        XPath expression.

        >>> from genshi import HTML
        >>> stream = HTML('<doc><elem>foo</elem><elem>bar</elem></doc>')
        >>> print stream.select('elem')
        <elem>foo</elem><elem>bar</elem>
        >>> print stream.select('elem/text()')
        foobar

        Note that the outermost element of the stream becomes the *context
        node* for the XPath test. That means that the expression "doc" would
        not match anything in the example above, because it only tests against
        child elements of the outermost element:

        >>> print stream.select('doc')
        <BLANKLINE>

        You can use the "." expression to match the context node itself
        (although that usually makes little sense):

        >>> print stream.select('.')
        <doc><elem>foo</elem><elem>bar</elem></doc>

        :param path: a string containing the XPath expression
        :param namespaces: mapping of namespace prefixes used in the path
        :param variables: mapping of variable names to values
        :return: the selected substream
        :rtype: `Stream`
        :raises PathSyntaxError: if the given path expression is invalid or not
                                 supported
        """
        from genshi.path import Path
        compiled = Path(path)
        return compiled.select(self, namespaces, variables)

    def serialize(self, method='xml', **kwargs):
        """Generate strings corresponding to a specific serialization of the
        stream.

        In contrast to `render()`, which returns one single string, the
        serialized output is produced incrementally here.

        Any additional keyword arguments are passed to the serializer, and thus
        depend on the `method` parameter value.

        :param method: determines how the stream is serialized; can be either
                       "xml", "xhtml", "html", "text", or a custom serializer
                       class; if `None`, the default serialization method of
                       the stream is used
        :return: an iterator over the serialization results (`Markup` or
                 `unicode` objects, depending on the serialization method)
        :rtype: ``iterator``
        :see: XMLSerializer, XHTMLSerializer, HTMLSerializer, TextSerializer
        """
        from genshi.output import get_serializer
        if method is None:
            # Use the stream's own default, or XML if it does not have one
            method = self.serializer or 'xml'
        serializer = get_serializer(method, **kwargs)
        return serializer(_ensure(self))

    def __str__(self):
        """Return the result of `render()` using its default parameters."""
        return self.render()

    def __unicode__(self):
        """Return the result of `render()` with encoding disabled."""
        return self.render(encoding=None)

    def __html__(self):
        """Implement the ``__html__`` protocol by returning the stream
        itself.
        """
        return self
250
251
# Module-level aliases for the event kinds defined on the `Stream` class, so
# that they can be referenced without the class prefix (for example in
# comparisons such as ``kind is TEXT``).
START = Stream.START
END = Stream.END
TEXT = Stream.TEXT
XML_DECL = Stream.XML_DECL
DOCTYPE = Stream.DOCTYPE
START_NS = Stream.START_NS
END_NS = Stream.END_NS
START_CDATA = Stream.START_CDATA
END_CDATA = Stream.END_CDATA
PI = Stream.PI
COMMENT = Stream.COMMENT
263
264def _ensure(stream):
265    """Ensure that every item on the stream is actually a markup event."""
266    stream = iter(stream)
267    event = stream.next()
268
269    # Check whether the iterable is a real markup event stream by examining the
270    # first item it yields; if it's not we'll need to do some conversion
271    if type(event) is not tuple or len(event) != 3:
272        for event in chain([event], stream):
273            if hasattr(event, 'totuple'):
274                event = event.totuple()
275            else:
276                event = TEXT, unicode(event), (None, -1, -1)
277            yield event
278        return
279
280    # This looks like a markup event stream, so we'll just pass it through
281    # unchanged
282    yield event
283    for event in stream:
284        yield event
285
286
class Attrs(tuple):
    """Immutable sequence type that stores the attributes of an element.

    Ordering of the attributes is preserved, while access by name is also
    supported.

    >>> attrs = Attrs([('href', '#'), ('title', 'Foo')])
    >>> attrs
    Attrs([('href', '#'), ('title', 'Foo')])

    >>> 'href' in attrs
    True
    >>> 'tabindex' in attrs
    False
    >>> attrs.get('title')
    'Foo'

    Instances may not be manipulated directly. Instead, the operators ``|`` and
    ``-`` can be used to produce new instances that have specific attributes
    added, replaced or removed.

    To remove an attribute, use the ``-`` operator. The right hand side can be
    either a string or a set/sequence of strings, identifying the name(s) of
    the attribute(s) to remove:

    >>> attrs - 'title'
    Attrs([('href', '#')])
    >>> attrs - ('title', 'href')
    Attrs()

    The original instance is not modified, but the operator can of course be
    used with an assignment:

    >>> attrs
    Attrs([('href', '#'), ('title', 'Foo')])
    >>> attrs -= 'title'
    >>> attrs
    Attrs([('href', '#')])

    To add a new attribute, use the ``|`` operator, where the right hand value
    is a sequence of ``(name, value)`` tuples (which includes `Attrs`
    instances):

    >>> attrs | [('title', 'Bar')]
    Attrs([('href', '#'), ('title', 'Bar')])

    If the attributes already contain an attribute with a given name, the value
    of that attribute is replaced:

    >>> attrs | [('href', 'http://example.org/')]
    Attrs([('href', 'http://example.org/')])
    """
    __slots__ = []

    def __contains__(self, name):
        """Return whether the list includes an attribute with the specified
        name.

        :param name: the name of the attribute to look for
        :return: `True` if the list includes the attribute, `False` otherwise
        :rtype: `bool`
        """
        for attr, _ in self:
            if attr == name:
                return True
        # Return a real boolean instead of implicitly falling through to
        # `None`, so that direct calls honour the documented return type too
        return False

    def __getslice__(self, i, j):
        """Return a slice of the attributes list.

        The result is wrapped in an `Attrs` instance again, rather than being
        a plain tuple.

        >>> attrs = Attrs([('href', '#'), ('title', 'Foo')])
        >>> attrs[1:]
        Attrs([('title', 'Foo')])
        """
        return Attrs(tuple.__getslice__(self, i, j))

    def __or__(self, attrs):
        """Return a new instance that contains the attributes in `attrs` in
        addition to any already existing attributes.

        Attributes whose name already exists keep their position but take the
        value given in `attrs`; attributes with new names are appended after
        the existing ones.

        :param attrs: a sequence of ``(name, value)`` tuples
        :return: a new instance with the merged attributes
        :rtype: `Attrs`
        """
        # New values for those names that are already present
        repl = dict([(an, av) for an, av in attrs if an in self])
        return Attrs([(sn, repl.get(sn, sv)) for sn, sv in self] +
                     [(an, av) for an, av in attrs if an not in self])

    def __repr__(self):
        """Return the representation used in the doctests: ``Attrs()`` when
        empty, otherwise ``Attrs([...])`` listing the ``(name, value)``
        tuples.
        """
        if not self:
            return 'Attrs()'
        return 'Attrs([%s])' % ', '.join([repr(item) for item in self])

    def __sub__(self, names):
        """Return a new instance from which all attributes with a name in
        `names` have been removed.

        :param names: the name of the attribute to remove as a single string,
                      or a collection of such names
        :return: a new instance with the attribute(s) removed
        :rtype: `Attrs`
        """
        if isinstance(names, basestring):
            # Wrap a single name, as the `in` test below would otherwise
            # perform a substring match against that string
            names = (names,)
        return Attrs([(name, val) for name, val in self if name not in names])

    def get(self, name, default=None):
        """Return the value of the attribute with the specified name, or the
        value of the `default` parameter if no such attribute is found.

        :param name: the name of the attribute
        :param default: the value to return when the attribute does not exist
        :return: the attribute value, or the `default` value if that attribute
                 does not exist
        :rtype: `object`
        """
        for attr, value in self:
            if attr == name:
                return value
        return default

    def totuple(self):
        """Return the attributes as a markup event.

        The returned event is a `TEXT` event, the data is the value of all
        attributes joined together.

        >>> Attrs([('href', '#'), ('title', 'Foo')]).totuple()
        ('TEXT', u'#Foo', (None, -1, -1))

        :return: a `TEXT` event
        :rtype: `tuple`
        """
        return TEXT, u''.join([x[1] for x in self]), (None, -1, -1)
417
418
class Markup(unicode):
    """Marks a string as being safe for inclusion in HTML/XML output without
    needing to be escaped.

    The operators defined here escape the other operand (through the
    module-level `escape` function) before combining it with the markup, and
    return a `Markup` instance again.
    """
    __slots__ = []

    def __add__(self, other):
        """Return this markup followed by the escaped form of `other`."""
        return Markup(unicode(self) + unicode(escape(other)))

    def __radd__(self, other):
        """Return the escaped form of `other` followed by this markup."""
        return Markup(unicode(escape(other)) + unicode(self))

    def __mod__(self, args):
        """Apply string formatting, escaping the arguments beforehand.

        For a `dict` every value is escaped, for a `list` or `tuple` every
        item, and any other object is escaped as a single argument.
        """
        if isinstance(args, dict):
            args = dict([(key, escape(val)) for key, val in args.items()])
        elif isinstance(args, (list, tuple)):
            args = tuple(map(escape, args))
        else:
            args = escape(args)
        return Markup(unicode.__mod__(self, args))

    def __mul__(self, num):
        """Return the markup repeated `num` times."""
        return Markup(unicode(self) * num)

    def __rmul__(self, num):
        """Return the markup repeated `num` times."""
        return Markup(num * unicode(self))

    def __repr__(self):
        return '<%s %r>' % (self.__class__.__name__, unicode(self))

    def join(self, seq, escape_quotes=True):
        """Return a `Markup` object which is the concatenation of the strings
        in the given sequence, where this `Markup` object is the separator
        between the joined elements.

        Any element in the sequence that is not a `Markup` instance is
        automatically escaped.

        :param seq: the sequence of strings to join
        :param escape_quotes: whether double quote characters in the elements
                              should be escaped
        :return: the joined `Markup` object
        :rtype: `Markup`
        :see: `escape`
        """
        return Markup(unicode(self).join([escape(item, quotes=escape_quotes)
                                          for item in seq]))

    def escape(cls, text, quotes=True):
        """Create a Markup instance from a string and escape special characters
        it may contain (<, >, & and \").

        >>> escape('"1 < 2"')
        <Markup u'&#34;1 &lt; 2&#34;'>

        If the `quotes` parameter is set to `False`, the \" character is left
        as is. Escaping quotes is generally only required for strings that are
        to be used in attribute values.

        >>> escape('"1 < 2"', quotes=False)
        <Markup u'"1 &lt; 2"'>

        Any value that evaluates to false results in an empty instance. An
        object that is already an instance of exactly this class is returned
        unchanged, and for an object providing an ``__html__()`` method the
        result of that method is wrapped without any escaping.

        :param text: the text to escape
        :param quotes: if ``True``, double quote characters are escaped in
                       addition to the other special characters
        :return: the escaped `Markup` string
        :rtype: `Markup`
        """
        if not text:
            return cls()
        if type(text) is cls:
            return text
        if hasattr(text, '__html__'):
            # Instantiate `cls` here as well, so that all branches of this
            # class method return the same type when invoked on a subclass
            return cls(text.__html__())

        # The ampersand needs to be replaced first, as the other replacements
        # introduce ampersands of their own
        text = unicode(text).replace('&', '&amp;') \
                            .replace('<', '&lt;') \
                            .replace('>', '&gt;')
        if quotes:
            text = text.replace('"', '&#34;')
        return cls(text)
    escape = classmethod(escape)

    def unescape(self):
        """Reverse-escapes &, <, >, and \" and returns a `unicode` object.

        >>> Markup('1 &lt; 2').unescape()
        u'1 < 2'

        :return: the unescaped string
        :rtype: `unicode`
        :see: `genshi.core.unescape`
        """
        if not self:
            return u''
        # The ampersand is restored last, so that text produced by that
        # replacement is not unescaped a second time
        return unicode(self).replace('&#34;', '"') \
                            .replace('&gt;', '>') \
                            .replace('&lt;', '<') \
                            .replace('&amp;', '&')

    def stripentities(self, keepxmlentities=False):
        """Return a copy of the text with any character or numeric entities
        replaced by the equivalent UTF-8 characters.

        If the `keepxmlentities` parameter is provided and evaluates to `True`,
        the core XML entities (``&amp;``, ``&apos;``, ``&gt;``, ``&lt;`` and
        ``&quot;``) are not stripped.

        :param keepxmlentities: whether the core XML entities should be kept
        :return: a `Markup` instance with entities removed
        :rtype: `Markup`
        :see: `genshi.util.stripentities`
        """
        return Markup(stripentities(self, keepxmlentities=keepxmlentities))

    def striptags(self):
        """Return a copy of the text with all XML/HTML tags removed.

        :return: a `Markup` instance with all tags removed
        :rtype: `Markup`
        :see: `genshi.util.striptags`
        """
        return Markup(striptags(self))
541
542
# Rebind the name `Markup` to the implementation provided by the
# `genshi._speedups` module whenever that module can be imported; otherwise
# the class defined above remains in place.
try:
    from genshi._speedups import Markup
except ImportError:
    pass # just use the Python implementation

# Module-level shortcut for the `escape` class method of whichever `Markup`
# implementation is bound at this point.
escape = Markup.escape
549
def unescape(text):
    """Reverse-escapes &, <, >, and \" and returns a `unicode` object.

    >>> unescape(Markup('1 &lt; 2'))
    u'1 < 2'

    Anything that is not a `Markup` instance is handed back as is, without
    being touched:

    >>> unescape('1 &lt; 2')
    '1 &lt; 2'

    :param text: the text to unescape
    :return: the unescaped string
    :rtype: `unicode`
    """
    if isinstance(text, Markup):
        return text.unescape()
    return text
569
570
class Namespace(object):
    """Utility class creating and testing elements with a namespace.

    Internally, namespace URIs are encoded in the `QName` of any element or
    attribute, the namespace URI being enclosed in curly braces. This class
    helps create and test these strings.

    A `Namespace` object is instantiated with the namespace URI.

    >>> html = Namespace('http://www.w3.org/1999/xhtml')
    >>> html
    <Namespace "http://www.w3.org/1999/xhtml">
    >>> html.uri
    u'http://www.w3.org/1999/xhtml'

    The `Namespace` object can then be used to generate `QName` objects with
    that namespace:

    >>> html.body
    QName(u'http://www.w3.org/1999/xhtml}body')
    >>> html.body.localname
    u'body'
    >>> html.body.namespace
    u'http://www.w3.org/1999/xhtml'

    The same works using item access notation, which is useful for element or
    attribute names that are not valid Python identifiers:

    >>> html['body']
    QName(u'http://www.w3.org/1999/xhtml}body')

    A `Namespace` object can also be used to test whether a specific `QName`
    belongs to that namespace using the ``in`` operator:

    >>> qname = html.body
    >>> qname in html
    True
    >>> qname in Namespace('http://www.w3.org/2002/06/xhtml2')
    False

    Namespaces compare equal to other namespaces with the same URI as well as
    to the URI string itself, and hash accordingly:

    >>> html == Namespace('http://www.w3.org/1999/xhtml')
    True
    >>> hash(html) == hash(Namespace('http://www.w3.org/1999/xhtml'))
    True
    """
    def __new__(cls, uri):
        # An object that already is an instance of exactly this class is
        # returned as is instead of being wrapped in a new instance
        if type(uri) is cls:
            return uri
        return object.__new__(cls)

    def __getnewargs__(self):
        return (self.uri,)

    def __getstate__(self):
        return self.uri

    def __setstate__(self, uri):
        self.uri = uri

    def __init__(self, uri):
        """Initialize the namespace.

        :param uri: the namespace URI; it is stored as a `unicode` object
        """
        self.uri = unicode(uri)

    def __contains__(self, qname):
        """Return whether the ``namespace`` attribute of the given `QName`
        equals the URI of this namespace.
        """
        return qname.namespace == self.uri

    def __ne__(self, other):
        return not self == other

    def __eq__(self, other):
        """Compare by URI: against the ``uri`` of another `Namespace`, or
        directly against any other kind of object.
        """
        if isinstance(other, Namespace):
            return self.uri == other.uri
        return self.uri == other

    def __hash__(self):
        # Keep hashing in line with `__eq__`: objects that compare equal (be
        # it another namespace or the URI string) must produce the same hash
        # to be interchangeable as dictionary keys and set members
        return hash(self.uri)

    def __getitem__(self, name):
        """Return the `QName` for the given local name in this namespace."""
        return QName(self.uri + u'}' + name)
    # Attribute access for names that are not found otherwise produces
    # qualified names in the same way
    __getattr__ = __getitem__

    def __repr__(self):
        return '<Namespace "%s">' % self.uri

    def __str__(self):
        return self.uri.encode('utf-8')

    def __unicode__(self):
        return self.uri
651
652
# Predefined `Namespace` instance for the namespace used by attributes such as
# xml:lang and xml:space
XML_NAMESPACE = Namespace('http://www.w3.org/XML/1998/namespace')
655
656
class QName(unicode):
    """A qualified element or attribute name.

    The unicode value of instances of this class contains the qualified name of
    the element or attribute, in the form ``{namespace-uri}local-name``. The
    namespace URI can be obtained through the additional `namespace` attribute,
    while the local name can be accessed through the `localname` attribute.

    >>> qname = QName('foo')
    >>> qname
    QName(u'foo')
    >>> qname.localname
    u'foo'
    >>> qname.namespace

    >>> qname = QName('http://www.w3.org/1999/xhtml}body')
    >>> qname
    QName(u'http://www.w3.org/1999/xhtml}body')
    >>> qname.localname
    u'body'
    >>> qname.namespace
    u'http://www.w3.org/1999/xhtml'

    Whether or not the leading curly brace is included makes no difference for
    the resulting name:

    >>> QName('{http://www.w3.org/1999/xhtml}body') == qname
    True
    """
    __slots__ = ['namespace', 'localname']

    def __new__(cls, qname):
        """Create the `QName` instance.

        An object that already is an instance of exactly this class is
        returned unchanged.

        :param qname: the qualified name as a string of the form
                      ``{namespace-uri}local-name``, where the leading curly
                      brace is optional
        """
        if type(qname) is cls:
            return qname

        # Drop the optional leading brace up front. Building the value from
        # the raw input would store ``{{uri}name`` for a name that was passed
        # in with the brace, which then would not compare equal to the very
        # same name passed in without it.
        qname = qname.lstrip(u'{')
        parts = qname.split(u'}', 1)
        if len(parts) > 1:
            self = unicode.__new__(cls, u'{%s' % qname)
            self.namespace, self.localname = map(unicode, parts)
        else:
            # No closing brace, so the name is not in any namespace
            self = unicode.__new__(cls, qname)
            self.namespace, self.localname = None, unicode(qname)
        return self

    def __getnewargs__(self):
        # The value is handed to `__new__` without the leading brace
        return (self.lstrip('{'),)

    def __repr__(self):
        return 'QName(%s)' % unicode.__repr__(self.lstrip('{'))
Note: See TracBrowser for help on using the repository browser.