Edgewall Software

source: tags/0.3.0/genshi/core.py

Last change on this file was 299, checked in by cmlenz, 17 years ago

Fixed EOL style.

  • Property svn:eol-style set to native
File size: 18.5 KB
Line 
1# -*- coding: utf-8 -*-
2#
3# Copyright (C) 2006 Edgewall Software
4# All rights reserved.
5#
6# This software is licensed as described in the file COPYING, which
7# you should have received as part of this distribution. The terms
8# are also available at http://genshi.edgewall.org/wiki/License.
9#
10# This software consists of voluntary contributions made by many
11# individuals. For the exact contribution history, see the revision
12# history and logs, available at http://genshi.edgewall.org/log/.
13
14"""Core classes for markup processing."""
15
16import htmlentitydefs
17import operator
18import re
19
20__all__ = ['Stream', 'Markup', 'escape', 'unescape', 'Namespace', 'QName']
21
22
class StreamEventKind(str):
    """Marker type for the kind of a markup event.

    Instances are ordinary strings (for example `'START'` or `'TEXT'`); the
    dedicated subclass only makes event kinds distinguishable from arbitrary
    strings by their type.
    """
25
26
class Stream(object):
    """A stream of markup events.

    A stream is essentially an iterator over events. In addition it knows how
    to turn itself into text: `serialize()` returns an iterator over the
    generated strings, whereas `render()` returns the complete output in one
    piece. Both accept parameters that influence the serialization.

    Every event is a tuple of the form:

      (kind, data, position)

    `kind` is the event kind (such as `START`, `END`, `TEXT`, etc), the
    meaning of `data` depends on the kind, and `position` is a
    `(filename, line, offset)` tuple locating the original element or text in
    the input. When the location is not known, `position` is
    `(None, -1, -1)`.
    """
    __slots__ = ['events']

    # The kinds of events that can occur on a stream
    START = StreamEventKind('START') # start tag
    END = StreamEventKind('END') # end tag
    TEXT = StreamEventKind('TEXT') # literal text
    DOCTYPE = StreamEventKind('DOCTYPE') # doctype declaration
    START_NS = StreamEventKind('START_NS') # start of a namespace mapping
    END_NS = StreamEventKind('END_NS') # end of a namespace mapping
    START_CDATA = StreamEventKind('START_CDATA') # start of a CDATA section
    END_CDATA = StreamEventKind('END_CDATA') # end of a CDATA section
    PI = StreamEventKind('PI') # processing instruction
    COMMENT = StreamEventKind('COMMENT') # comment

    def __init__(self, events):
        """Wrap the given events in a stream.

        @param events: a sequence or iterable providing the events
        """
        self.events = events

    def __iter__(self):
        """Return an iterator over the underlying events."""
        return iter(self.events)

    def __or__(self, function):
        """Apply a filter or serializer to the stream using the "bitwise or"
        operator, which gives a syntax resembling pipes on Unix shells.

        Assume the following stream produced by the `HTML` function:

        >>> from genshi.input import HTML
        >>> html = HTML('''<p onclick="alert('Whoa')">Hello, world!</p>''')
        >>> print html
        <p onclick="alert('Whoa')">Hello, world!</p>

        A filter such as the HTML sanitizer can be applied to that stream using
        the pipe notation as follows:

        >>> from genshi.filters import HTMLSanitizer
        >>> sanitizer = HTMLSanitizer()
        >>> print html | sanitizer
        <p>Hello, world!</p>

        Filters can be any function that accepts and produces a stream (where
        a stream is anything that iterates over events):

        >>> def uppercase(stream):
        ...     for kind, data, pos in stream:
        ...         if kind is TEXT:
        ...             data = data.upper()
        ...         yield kind, data, pos
        >>> print html | sanitizer | uppercase
        <p>HELLO, WORLD!</p>

        Serializers can also be used with this notation:

        >>> from genshi.output import TextSerializer
        >>> output = TextSerializer()
        >>> print html | sanitizer | uppercase | output
        HELLO, WORLD!

        Commonly, serializers should be used at the end of the "pipeline";
        using them somewhere in the middle may produce unexpected results.
        """
        # Whatever the callable produces is normalized into proper events and
        # wrapped in a new stream, so that further `|` operations can follow.
        filtered = function(self)
        return Stream(_ensure(filtered))

    def filter(self, *filters):
        """Return a new stream with the given filters applied in order.

        Each filter must be a callable that accepts the stream object as
        parameter and returns the filtered stream.

        The call:

            stream.filter(filter1, filter2)

        is equivalent to:

            stream | filter1 | filter2
        """
        stream = self
        for function in filters:
            stream = stream | function
        return stream

    def render(self, method='xml', encoding='utf-8', **kwargs):
        """Serialize the stream and return the result as a single string.

        @param method: determines how the stream is serialized; can be either
                       "xml", "xhtml", "html", "text", or a custom serializer
                       class
        @param encoding: how the output string should be encoded; if set to
                         `None`, this method returns a `unicode` object

        Any additional keyword arguments are passed to the serializer, and thus
        depend on the `method` parameter value.
        """
        chunks = list(self.serialize(method=method, **kwargs))
        text = u''.join(chunks)
        if encoding is None:
            return text
        # Characters the target encoding cannot represent become character
        # references, except for the "text" method, where they are replaced
        # using the codec's "replace" handler.
        if method != 'text':
            errors = 'xmlcharrefreplace'
        else:
            errors = 'replace'
        return text.encode(encoding, errors)

    def select(self, path):
        """Return a new stream containing only the events that match the
        given XPath expression.

        @param path: a string containing the XPath expression
        """
        # Imported here rather than at module level
        from genshi.path import Path
        xpath = Path(path)
        return xpath.select(self)

    def serialize(self, method='xml', **kwargs):
        """Produce the strings that make up a specific serialization of the
        stream.

        In contrast to `render()`, the output is handed back incrementally
        by the serializer instead of being joined into a single string.

        @param method: determines how the stream is serialized; can be either
                       "xml", "xhtml", "html", "text", or a custom serializer
                       class

        Any additional keyword arguments are passed to the serializer, and thus
        depend on the `method` parameter value.
        """
        from genshi import output
        if isinstance(method, basestring):
            # Well-known method names map to the bundled serializer classes;
            # an unknown name results in a `KeyError`.
            serializers = {'xml':   output.XMLSerializer,
                           'xhtml': output.XHTMLSerializer,
                           'html':  output.HTMLSerializer,
                           'text':  output.TextSerializer}
            cls = serializers[method]
        else:
            cls = method
        serializer = cls(**kwargs)
        return serializer(_ensure(self))

    def __str__(self):
        """Return the stream rendered with the default method and encoding."""
        return self.render()

    def __unicode__(self):
        """Return the stream rendered as an unencoded `unicode` object."""
        return self.render(encoding=None)
187
188
# Module-level aliases for the event kinds declared on `Stream`, so that they
# can be referenced without going through the class.
START, END, TEXT = Stream.START, Stream.END, Stream.TEXT
DOCTYPE = Stream.DOCTYPE
START_NS, END_NS = Stream.START_NS, Stream.END_NS
START_CDATA, END_CDATA = Stream.START_CDATA, Stream.END_CDATA
PI, COMMENT = Stream.PI, Stream.COMMENT
199
def _ensure(stream):
    """Yield the items of `stream`, converting anything that is not already
    an event tuple into one.

    Plain tuples are passed through unchanged. Objects that provide a
    `totuple()` method are replaced by the result of calling it; any other
    value becomes a `TEXT` event carrying its unicode representation and an
    unknown position.
    """
    for item in stream:
        if type(item) is tuple:
            yield item
        elif hasattr(item, 'totuple'):
            yield item.totuple()
        else:
            yield TEXT, unicode(item), (None, -1, -1)
209
210
class Attrs(list):
    """Sequence type that stores the attributes of an element.

    The order of the attributes is preserved, while accessing and manipulating
    attributes by name is also supported. Each item of the list is a
    `(name, value)` tuple whose name is a `QName`.

    >>> attrs = Attrs([('href', '#'), ('title', 'Foo')])
    >>> attrs
    [(u'href', '#'), (u'title', 'Foo')]

    >>> 'href' in attrs
    True
    >>> 'tabindex' in attrs
    False

    >>> attrs.get(u'title')
    'Foo'
    >>> attrs.set(u'title', 'Bar')
    >>> attrs
    [(u'href', '#'), (u'title', 'Bar')]
    >>> attrs.remove(u'title')
    >>> attrs
    [(u'href', '#')]

    New attributes added using the `set()` method are appended to the end of
    the list:

    >>> attrs.set(u'accesskey', 'k')
    >>> attrs
    [(u'href', '#'), (u'accesskey', 'k')]

    An `Attrs` instance can also be initialized with keyword arguments.

    >>> attrs = Attrs(class_='bar', href='#', title='Foo')
    >>> attrs.get('class')
    'bar'
    >>> attrs.get('href')
    '#'
    >>> attrs.get('title')
    'Foo'

    Reserved words can be used by appending a trailing underscore to the name,
    and any other underscore is replaced by a dash:

    >>> attrs = Attrs(class_='bar', accept_charset='utf-8')
    >>> attrs.get('class')
    'bar'
    >>> attrs.get('accept-charset')
    'utf-8'

    Thus this shorthand can not be used if attribute names should contain
    actual underscore characters.
    """
    __slots__ = []

    def __init__(self, attrib=None, **kwargs):
        """Create the `Attrs` instance.

        @param attrib: an optional sequence of `(name, value)` tuples; every
                       name is converted to a `QName`
        @param kwargs: additional attributes given as keyword arguments;
                       trailing underscores are removed from the names and
                       the remaining underscores are replaced by dashes
        """
        if attrib is None:
            attrib = []
        list.__init__(self, [(QName(name), value) for name, value in attrib])
        for name, value in kwargs.items():
            self.set(name.rstrip('_').replace('_', '-'), value)

    def __contains__(self, name):
        """Return whether the list includes an attribute with the specified
        name.

        @param name: the name of the attribute to look for
        @return: `True` if such an attribute exists, `False` otherwise
        """
        for attr, _ in self:
            if attr == name:
                return True
        # Answer with a real boolean instead of implicitly returning `None`
        return False

    def get(self, name, default=None):
        """Return the value of the attribute with the specified name, or the
        value of the `default` parameter if no such attribute is found.

        @param name: the name of the attribute
        @param default: the value to return if the attribute is not present
        """
        for attr, value in self:
            if attr == name:
                return value
        return default

    def remove(self, name):
        """Remove the attribute with the specified name.

        If no such attribute is found, this method does nothing. Note that
        this differs from `list.remove()`, which takes a list item and raises
        an error if it is missing.

        @param name: the name of the attribute to remove
        """
        for idx, (attr, _) in enumerate(self):
            if attr == name:
                # Leaving the loop right away makes it safe to delete from
                # the list that is being iterated over.
                del self[idx]
                break

    def set(self, name, value):
        """Set the specified attribute to the given value.

        If an attribute with the specified name is already in the list, the
        value of the existing entry is updated in place, so its position is
        kept. Otherwise, a new attribute is appended to the end of the list.

        @param name: the name of the attribute
        @param value: the new value of the attribute
        """
        for idx, (attr, _) in enumerate(self):
            if attr == name:
                self[idx] = (QName(attr), value)
                break
        else:
            # The loop finished without finding the name
            self.append((QName(name), value))

    def totuple(self):
        """Return the attributes as a markup event.

        The returned event is a TEXT event, the data is the value of all
        attributes joined together, and the position is unknown.
        """
        return TEXT, u''.join([x[1] for x in self]), (None, -1, -1)
326
327
def plaintext(text, keeplinebreaks=True):
    """Return `text` with all tags removed and all entities replaced by the
    characters they stand for.

    Tags are stripped first, then entities are resolved. Unless
    `keeplinebreaks` is true, every newline in the result is additionally
    replaced by a single space.
    """
    stripped = stripentities(striptags(text))
    if keeplinebreaks:
        return stripped
    return stripped.replace(u'\n', u' ')
336
def stripentities(text, keepxmlentities=False):
    """Return a copy of the given text with any character or numeric entities
    replaced by the equivalent unicode characters.

    Numeric references may be decimal (`&#65;`) or hexadecimal with a lower-
    or upper-case prefix (`&#x41;`, `&#X41;`). A numeric reference whose code
    point cannot be represented as a character is left in the text unchanged.

    If the `keepxmlentities` parameter is provided and evaluates to `True`,
    the core XML entities (&amp;, &apos;, &gt;, &lt; and &quot;) are not
    stripped.

    Named entities that are not known are reduced to their bare name, or,
    when `keepxmlentities` is true, kept with the ampersand escaped
    (`&amp;name;`).
    """
    def _replace_entity(match):
        ref = match.group(1)
        if ref: # numeric entity
            # The pattern accepts both `x` and `X` as the hexadecimal marker
            if ref[0] in 'xX':
                codepoint = int(ref[1:], 16)
            else:
                codepoint = int(ref, 10)
            try:
                return unichr(codepoint)
            except (ValueError, OverflowError):
                # Not a representable code point: keep the reference as is
                # rather than failing the whole substitution
                return match.group(0)
        else: # character entity
            ref = match.group(2)
            if keepxmlentities and ref in ('amp', 'apos', 'gt', 'lt', 'quot'):
                return '&%s;' % ref
            try:
                codepoint = htmlentitydefs.name2codepoint[ref]
                return unichr(codepoint)
            except KeyError:
                if keepxmlentities:
                    return '&amp;%s;' % ref
                else:
                    return ref
    return re.sub(r'&(?:#((?:\d+)|(?:[xX][0-9a-fA-F]+));?|(\w+);)',
                  _replace_entity, text)
367
def striptags(text):
    """Return a copy of `text` from which everything that looks like an
    XML/HTML tag, i.e. any span from a `<` up to the next `>`, is removed.
    """
    tag_pattern = re.compile(r'<[^>]*?>')
    return tag_pattern.sub('', text)
371
372
class Markup(unicode):
    """Marks a string as being safe for inclusion in HTML/XML output without
    needing to be escaped.

    Operations that combine a `Markup` instance with other values (`+`, `%`
    and `join()`) escape those values first and return a new `Markup`
    instance.
    """
    __slots__ = []

    def __new__(cls, text='', *args):
        """Create the instance.

        @param text: the markup text
        @param args: optional values that are escaped and then interpolated
                     into `text` using the `%` operator
        """
        if args:
            text %= tuple(map(Markup.escape, args))
        return unicode.__new__(cls, text)

    def __add__(self, other):
        """Return the markup followed by the escaped `other` value."""
        return Markup(unicode(self) + unicode(Markup.escape(other)))

    def __radd__(self, other):
        """Return the escaped `other` value followed by the markup."""
        return Markup(unicode(Markup.escape(other)) + unicode(self))

    def __mod__(self, args):
        """Interpolate the escaped arguments into the markup.

        @param args: a list or tuple of positional values, a dictionary for
                     `%(name)s` style placeholders, or a single value
        @return: the formatted text as a `Markup` instance
        """
        if isinstance(args, dict):
            # Named placeholders need a mapping, so escape the values but
            # keep the keys.
            args = dict([(key, Markup.escape(value))
                         for key, value in args.items()])
        elif isinstance(args, (list, tuple)):
            args = tuple(map(Markup.escape, args))
        else:
            # A single value is wrapped in a one-element tuple
            args = (Markup.escape(args),)
        return Markup(unicode.__mod__(self, args))

    def __mul__(self, num):
        """Return the markup repeated `num` times."""
        return Markup(unicode(self) * num)

    def __rmul__(self, num):
        """Return the markup repeated `num` times."""
        return Markup(num * unicode(self))

    def __repr__(self):
        return '<%s "%s">' % (self.__class__.__name__, self)

    def join(self, seq, escape_quotes=True):
        """Join the escaped items of `seq`, using this markup as separator.

        @param seq: the items to join
        @param escape_quotes: whether double quotes in the items should be
                              escaped
        @return: the joined text as a `Markup` instance
        """
        return Markup(unicode(self).join([
            Markup.escape(item, quotes=escape_quotes) for item in seq
        ]))

    def escape(cls, text, quotes=True):
        """Create a Markup instance from a string and escape special characters
        it may contain (<, >, & and ").

        If the `quotes` parameter is set to `False`, the " character is left
        as is. Escaping quotes is generally only required for strings that are
        to be used in attribute values.

        Any value that evaluates to false (such as `None` or the empty string)
        results in an empty instance, and a value that already is an instance
        of the class is returned unchanged.
        """
        if not text:
            return cls()
        if type(text) is cls:
            return text
        # The ampersand must be replaced first, otherwise the ampersands of
        # the entities inserted below would be escaped again.
        text = unicode(text).replace('&', '&amp;') \
                            .replace('<', '&lt;') \
                            .replace('>', '&gt;')
        if quotes:
            text = text.replace('"', '&#34;')
        return cls(text)
    escape = classmethod(escape)

    def unescape(self):
        """Reverse-escapes &, <, > and " and returns a `unicode` object."""
        if not self:
            return u''
        # The ampersand is handled last so that text such as `&amp;lt;` is
        # unescaped only once.
        return unicode(self).replace('&#34;', '"') \
                            .replace('&gt;', '>') \
                            .replace('&lt;', '<') \
                            .replace('&amp;', '&')

    def stripentities(self, keepxmlentities=False):
        """Return a copy of the text with any character or numeric entities
        replaced by the equivalent unicode characters.

        If the `keepxmlentities` parameter is provided and evaluates to `True`,
        the core XML entities (&amp;, &apos;, &gt;, &lt; and &quot;) are not
        stripped.
        """
        # Inside the method body this name refers to the module-level function
        return Markup(stripentities(self, keepxmlentities=keepxmlentities))

    def striptags(self):
        """Return a copy of the text with all XML/HTML tags removed."""
        # Inside the method body this name refers to the module-level function
        return Markup(striptags(self))
450
451
# Module-level shortcut for the `Markup.escape()` class method
escape = Markup.escape
453
def unescape(text):
    """Reverse the escaping of &, <, > and " in a `Markup` instance and
    return the result as a `unicode` object.

    Values that are not `Markup` instances are returned unchanged.
    """
    if isinstance(text, Markup):
        return text.unescape()
    return text
459
460
class Namespace(object):
    """Utility class creating and testing elements with a namespace.

    Internally, namespace URIs are encoded in the `QName` of any element or
    attribute, the namespace URI being enclosed in curly braces. This class
    helps create and test these strings.

    A `Namespace` object is instantiated with the namespace URI.

    >>> html = Namespace('http://www.w3.org/1999/xhtml')
    >>> html
    <Namespace "http://www.w3.org/1999/xhtml">
    >>> html.uri
    u'http://www.w3.org/1999/xhtml'

    The `Namespace` object can then be used to generate `QName` objects with
    that namespace:

    >>> html.body
    u'{http://www.w3.org/1999/xhtml}body'
    >>> html.body.localname
    u'body'
    >>> html.body.namespace
    u'http://www.w3.org/1999/xhtml'

    The same works using item access notation, which is useful for element or
    attribute names that are not valid Python identifiers:

    >>> html['body']
    u'{http://www.w3.org/1999/xhtml}body'

    A `Namespace` object can also be used to test whether a specific `QName`
    belongs to that namespace using the `in` operator:

    >>> qname = html.body
    >>> qname in html
    True
    >>> qname in Namespace('http://www.w3.org/2002/06/xhtml2')
    False

    Namespaces compare equal to other namespaces and to strings with the same
    URI, and hash like their URI:

    >>> html == Namespace('http://www.w3.org/1999/xhtml')
    True
    >>> html != Namespace('http://www.w3.org/1999/xhtml')
    False
    """
    def __new__(cls, uri):
        """Return `uri` itself if it already is an instance of the class,
        otherwise allocate a new instance.
        """
        if type(uri) is cls:
            return uri
        # `object.__new__()` must not be given extra arguments; the URI is
        # stored by `__init__()`.
        return object.__new__(cls)

    def __init__(self, uri):
        """Store the namespace URI as a `unicode` object.

        @param uri: the namespace URI, or an existing `Namespace` object
        """
        self.uri = unicode(uri)

    def __contains__(self, qname):
        """Return whether the given `QName` belongs to this namespace."""
        return qname.namespace == self.uri

    def __eq__(self, other):
        """Compare by URI, against another namespace or a plain string."""
        if isinstance(other, Namespace):
            return self.uri == other.uri
        return self.uri == other

    def __ne__(self, other):
        # Python 2 does not derive `!=` from `__eq__()`
        return not self == other

    def __hash__(self):
        # Keep hashing consistent with `__eq__()`, which compares the URI
        return hash(self.uri)

    def __getitem__(self, name):
        """Return the `QName` for the given local name in this namespace."""
        return QName(self.uri + u'}' + name)
    # Attribute access is a shortcut for item access; it is only consulted
    # for attributes that are not found through the regular lookup.
    __getattr__ = __getitem__

    def __repr__(self):
        return '<Namespace "%s">' % self.uri

    def __str__(self):
        return self.uri.encode('utf-8')

    def __unicode__(self):
        return self.uri
529
530
# Namespace of the attributes that carry the reserved `xml` prefix, such as
# xml:lang and xml:space
XML_NAMESPACE = Namespace('http://www.w3.org/XML/1998/namespace')
533
534
class QName(unicode):
    """A qualified element or attribute name.

    The unicode value of instances of this class contains the qualified name of
    the element or attribute, in the form `{namespace}localname`. The namespace
    URI can be obtained through the additional `namespace` attribute, while the
    local name can be accessed through the `localname` attribute.

    >>> qname = QName('foo')
    >>> qname
    u'foo'
    >>> qname.localname
    u'foo'
    >>> qname.namespace

    >>> qname = QName('http://www.w3.org/1999/xhtml}body')
    >>> qname
    u'{http://www.w3.org/1999/xhtml}body'
    >>> qname.localname
    u'body'
    >>> qname.namespace
    u'http://www.w3.org/1999/xhtml'

    The name may also be given with the leading curly brace, which is the
    form of the unicode value of an existing instance:

    >>> qname = QName('{http://www.w3.org/1999/xhtml}body')
    >>> qname
    u'{http://www.w3.org/1999/xhtml}body'
    >>> qname.namespace
    u'http://www.w3.org/1999/xhtml'
    """
    __slots__ = ['namespace', 'localname']

    def __new__(cls, qname):
        """Create the `QName` instance.

        @param qname: the qualified name, either a plain local name or a
                      namespace URI and local name separated by a closing
                      curly brace, optionally with the opening curly brace;
                      an existing `QName` instance is returned as is
        """
        if type(qname) is cls:
            return qname

        # Accept the canonical `{namespace}localname` form as well, so that
        # the text of an existing qualified name can be parsed again.
        qname = qname.lstrip(u'{')
        parts = qname.split(u'}', 1)
        if len(parts) > 1:
            self = unicode.__new__(cls, u'{%s' % qname)
            self.namespace, self.localname = map(unicode, parts)
        else:
            self = unicode.__new__(cls, qname)
            self.namespace, self.localname = None, unicode(qname)
        return self
Note: See TracBrowser for help on using the repository browser.