Edgewall Software

source: trunk/genshi/filters/transform.py

Last change on this file was 1158, checked in by hodgestar, 13 years ago

Merge r1141 from py3k:

add support for python 3 to genshi.filters:

  • minor changes to track encoding=None API change in core genshi modules.
  • renamed genshi/filters/tests/html.py to test_html.py to avoid clashes with Python 3 top-level html module when running tests subset.
  • did not rename genshi/filters/html.py.
  • i18n filters:
    • ugettext and friends are gone in Python 3 (and only gettext and friends exist and they now handle unicode)
    • Some \ line continuations inside doctests confused 2to3 and so were removed.
    • Testing picked up a problem (already present in trunk) where Translator.call could end up defining gettext as an endlessly recursive function. Noted with a TODO.
  • Property svn:eol-style set to native
File size: 47.1 KB
Line 
1# -*- coding: utf-8 -*-
2#
3# Copyright (C) 2007-2009 Edgewall Software
4# All rights reserved.
5#
6# This software is licensed as described in the file COPYING, which
7# you should have received as part of this distribution. The terms
8# are also available at http://genshi.edgewall.org/wiki/License.
9#
10# This software consists of voluntary contributions made by many
11# individuals. For the exact contribution history, see the revision
12# history and logs, available at http://genshi.edgewall.org/log/.
13
14"""A filter for functional-style transformations of markup streams.
15
16The `Transformer` filter provides a variety of transformations that can be
17applied to parts of streams that match given XPath expressions. These
18transformations can be chained to achieve results that would be comparatively
19tedious to achieve by writing stream filters by hand. The approach of chaining
20node selection and transformation has been inspired by the `jQuery`_ Javascript
21library.
22
23 .. _`jQuery`: http://jquery.com/
24
25For example, the following transformation removes the ``<title>`` element from
26the ``<head>`` of the input document:
27
28>>> from genshi.builder import tag
29>>> html = HTML('''<html>
30...  <head><title>Some Title</title></head>
31...  <body>
32...    Some <em>body</em> text.
33...  </body>
34... </html>''',
35... encoding='utf-8')
36>>> print(html | Transformer('body/em').map(unicode.upper, TEXT)
37...                                    .unwrap().wrap(tag.u))
38<html>
39  <head><title>Some Title</title></head>
40  <body>
41    Some <u>BODY</u> text.
42  </body>
43</html>
44
45The ``Transformer`` supports a large number of useful transformations out of the
46box, but custom transformations can be added easily.
47
48:since: version 0.5
49"""
50
51import re
52import sys
53
54from genshi.builder import Element
55from genshi.core import Stream, Attrs, QName, TEXT, START, END, _ensure, Markup
56from genshi.path import Path
57
58__all__ = ['Transformer', 'StreamBuffer', 'InjectorTransformation', 'ENTER',
59           'EXIT', 'INSIDE', 'OUTSIDE', 'BREAK']
60
61
class TransformMark(str):
    """A mark on a transformation stream.

    Marks are interned by value: constructing a mark with a value that has
    been used before returns the previously created object, so two marks with
    the same value are always the same object.
    """
    __slots__ = []
    # Registry of every mark created so far, keyed by its string value.
    _instances = {}

    def __new__(cls, val):
        """Return the mark registered for `val`, creating it on first use.

        :param val: the string value of the mark
        :return: the unique mark instance for `val`
        """
        try:
            return cls._instances[val]
        except KeyError:
            # Only build a new string object when the value is genuinely new;
            # ``dict.setdefault`` would evaluate its default eagerly and
            # allocate a throwaway instance on every lookup.
            mark = cls._instances[val] = str.__new__(cls, val)
            return mark
69
70
# Marks attached to the events of a transformation stream.  Every mark is a
# `TransformMark`; since `TransformMark` returns the same object for the same
# value, marks are compared by identity (``is``) elsewhere in this module,
# e.g. in `Transformer._unmark`.
ENTER = TransformMark('ENTER')
"""Stream augmentation mark indicating that a selected element is being
entered."""

INSIDE = TransformMark('INSIDE')
"""Stream augmentation mark indicating that processing is currently inside a
selected element."""

OUTSIDE = TransformMark('OUTSIDE')
"""Stream augmentation mark indicating that a match occurred outside a selected
element."""

ATTR = TransformMark('ATTR')
"""Stream augmentation mark indicating a selected element attribute."""

EXIT = TransformMark('EXIT')
"""Stream augmentation mark indicating that a selected element is being
exited."""

BREAK = TransformMark('BREAK')
"""Stream augmentation mark indicating a break between two otherwise contiguous
blocks of marked events.

This is used primarily by the cut() transform to provide later transforms with
an opportunity to operate on the cut buffer.
"""
97
98
class PushBackStream(object):
    """Allows a single event to be pushed back onto the stream and re-consumed.

    Iterating over the object yields the events of the wrapped stream.  While
    iterating, the consumer may call `push()` with one event; that event is
    then yielded again before the next event of the wrapped stream.
    """
    def __init__(self, stream):
        """Wrap `stream`.

        :param stream: any iterable of events
        """
        self.stream = iter(stream)
        # The pushed-back event, or ``None`` when nothing is pending.
        self.peek = None

    def push(self, event):
        """Push `event` back so that it is the next item yielded.

        Only one event can be pending at a time; pushing a second one before
        the first has been re-consumed trips the assertion.

        :param event: the event to re-deliver (must not be ``None``, which is
                      used internally to mean "nothing pending")
        """
        assert self.peek is None
        self.peek = event

    def __iter__(self):
        """Yield the pending pushed-back event, if any, else the next event
        of the wrapped stream, until the wrapped stream is exhausted.
        """
        while True:
            if self.peek is not None:
                peek = self.peek
                self.peek = None
                yield peek
            else:
                try:
                    # The builtin works on both Python 2.6+ and Python 3,
                    # unlike the Python 2 only ``.next()`` method.
                    event = next(self.stream)
                except StopIteration:
                    # Finish the generator with a plain return: letting
                    # StopIteration escape a generator body is turned into a
                    # RuntimeError by PEP 479.
                    return
                yield event
123
124
class Transformer(object):
    """Stream filter that can apply a variety of different transformations to
    a stream.

    This is achieved by selecting the events to be transformed using XPath,
    then applying the transformations to the events matched by the path
    expression. Each marked event is in the form (mark, (kind, data, pos)),
    where mark can be any of `ENTER`, `INSIDE`, `EXIT`, `OUTSIDE`, or `None`.

    The first three marks match `START` and `END` events, and any events
    contained `INSIDE` any selected XML/HTML element. A non-element match
    outside a `START`/`END` container (e.g. ``text()``) will yield an `OUTSIDE`
    mark.

    >>> html = HTML('<html><head><title>Some Title</title></head>'
    ...             '<body>Some <em>body</em> text.</body></html>',
    ...             encoding='utf-8')

    Transformations act on selected stream events matching an XPath expression.
    Here's an example of removing some markup (the title, in this case)
    selected by an expression:

    >>> print(html | Transformer('head/title').remove())
    <html><head/><body>Some <em>body</em> text.</body></html>

    Inserted content can be passed in the form of a string, or a markup event
    stream, which includes streams generated programmatically via the
    `builder` module:

    >>> from genshi.builder import tag
    >>> print(html | Transformer('body').prepend(tag.h1('Document Title')))
    <html><head><title>Some Title</title></head><body><h1>Document
    Title</h1>Some <em>body</em> text.</body></html>

    Each XPath expression determines the set of tags that will be acted upon by
    subsequent transformations. In this example we select the ``<title>`` text,
    copy it into a buffer, then select the ``<body>`` element and paste the
    copied text into the body as ``<h1>`` enclosed text:

    >>> buffer = StreamBuffer()
    >>> print(html | Transformer('head/title/text()').copy(buffer)
    ...     .end().select('body').prepend(tag.h1(buffer)))
    <html><head><title>Some Title</title></head><body><h1>Some Title</h1>Some
    <em>body</em> text.</body></html>

    Transformations can also be assigned and reused, although care must be
    taken when using buffers, to ensure that buffers are cleared between
    transforms:

    >>> emphasis = Transformer('body//em').attr('class', 'emphasis')
    >>> print(html | emphasis)
    <html><head><title>Some Title</title></head><body>Some <em
    class="emphasis">body</em> text.</body></html>
    """

    __slots__ = ['transforms']

    def __init__(self, path='.'):
        """Construct a new transformation filter.

        :param path: an XPath expression (as string) or a `Path` instance
        """
        self.transforms = [SelectTransformation(path)]

    def __call__(self, stream, keep_marks=False):
        """Apply the chain of transformations to a stream.

        Every event of the input is first marked `OUTSIDE`, then passed
        through each transformation in order.

        :param stream: the event stream to filter
        :param keep_marks: Do not strip transformer selection marks from the
                           stream. Useful for testing.
        :return: the transformed stream
        :rtype: `Stream`
        """
        transforms = self._mark(stream)
        for link in self.transforms:
            transforms = link(transforms)
        if not keep_marks:
            transforms = self._unmark(transforms)
        return Stream(transforms,
                      serializer=getattr(stream, 'serializer', None))

    def apply(self, function):
        """Apply a transformation to the stream.

        Transformations can be chained, similar to stream filters. Any callable
        accepting a marked stream can be used as a transform.

        As an example, here is a simple `TEXT` event upper-casing transform:

        >>> def upper(stream):
        ...     for mark, (kind, data, pos) in stream:
        ...         if mark and kind is TEXT:
        ...             yield mark, (kind, data.upper(), pos)
        ...         else:
        ...             yield mark, (kind, data, pos)
        >>> short_stream = HTML('<body>Some <em>test</em> text</body>',
        ...                      encoding='utf-8')
        >>> print(short_stream | Transformer('.//em/text()').apply(upper))
        <body>Some <em>TEST</em> text</body>

        :param function: a callable accepting a marked stream, or another
                         `Transformer` whose transformations are appended to
                         this one's
        :return: a new transformer; ``self`` is left unmodified
        :rtype: `Transformer`
        """
        # Allocate without running __init__: the constructor would build a
        # default SelectTransformation('.') that is thrown away immediately,
        # and this method runs once for every chained operation.
        transformer = Transformer.__new__(Transformer)
        transformer.transforms = self.transforms[:]
        if isinstance(function, Transformer):
            transformer.transforms.extend(function.transforms)
        else:
            transformer.transforms.append(function)
        return transformer

    #{ Selection operations

    def select(self, path):
        """Mark events matching the given XPath expression, within the current
        selection.

        >>> html = HTML('<body>Some <em>test</em> text</body>', encoding='utf-8')
        >>> print(html | Transformer().select('.//em').trace())
        (None, ('START', (QName('body'), Attrs()), (None, 1, 0)))
        (None, ('TEXT', u'Some ', (None, 1, 6)))
        ('ENTER', ('START', (QName('em'), Attrs()), (None, 1, 11)))
        ('INSIDE', ('TEXT', u'test', (None, 1, 15)))
        ('EXIT', ('END', QName('em'), (None, 1, 19)))
        (None, ('TEXT', u' text', (None, 1, 24)))
        (None, ('END', QName('body'), (None, 1, 29)))
        <body>Some <em>test</em> text</body>

        :param path: an XPath expression (as string) or a `Path` instance
        :return: the stream augmented by transformation marks
        :rtype: `Transformer`
        """
        return self.apply(SelectTransformation(path))

    def invert(self):
        """Invert selection so that marked events become unmarked, and vice
        versa.

        Specifically, all marks are converted to null marks, and all null marks
        are converted to OUTSIDE marks.

        >>> html = HTML('<body>Some <em>test</em> text</body>', encoding='utf-8')
        >>> print(html | Transformer('//em').invert().trace())
        ('OUTSIDE', ('START', (QName('body'), Attrs()), (None, 1, 0)))
        ('OUTSIDE', ('TEXT', u'Some ', (None, 1, 6)))
        (None, ('START', (QName('em'), Attrs()), (None, 1, 11)))
        (None, ('TEXT', u'test', (None, 1, 15)))
        (None, ('END', QName('em'), (None, 1, 19)))
        ('OUTSIDE', ('TEXT', u' text', (None, 1, 24)))
        ('OUTSIDE', ('END', QName('body'), (None, 1, 29)))
        <body>Some <em>test</em> text</body>

        :rtype: `Transformer`
        """
        return self.apply(InvertTransformation())

    def end(self):
        """End current selection, allowing all events to be selected.

        Example:

        >>> html = HTML('<body>Some <em>test</em> text</body>', encoding='utf-8')
        >>> print(html | Transformer('//em').end().trace())
        ('OUTSIDE', ('START', (QName('body'), Attrs()), (None, 1, 0)))
        ('OUTSIDE', ('TEXT', u'Some ', (None, 1, 6)))
        ('OUTSIDE', ('START', (QName('em'), Attrs()), (None, 1, 11)))
        ('OUTSIDE', ('TEXT', u'test', (None, 1, 15)))
        ('OUTSIDE', ('END', QName('em'), (None, 1, 19)))
        ('OUTSIDE', ('TEXT', u' text', (None, 1, 24)))
        ('OUTSIDE', ('END', QName('body'), (None, 1, 29)))
        <body>Some <em>test</em> text</body>

        :return: the stream augmented by transformation marks
        :rtype: `Transformer`
        """
        return self.apply(EndTransformation())

    #{ Deletion operations

    def empty(self):
        """Empty selected elements of all content.

        Example:

        >>> html = HTML('<html><head><title>Some Title</title></head>'
        ...             '<body>Some <em>body</em> text.</body></html>',
        ...             encoding='utf-8')
        >>> print(html | Transformer('.//em').empty())
        <html><head><title>Some Title</title></head><body>Some <em/>
        text.</body></html>

        :rtype: `Transformer`
        """
        return self.apply(EmptyTransformation())

    def remove(self):
        """Remove selection from the stream.

        Example:

        >>> html = HTML('<html><head><title>Some Title</title></head>'
        ...             '<body>Some <em>body</em> text.</body></html>',
        ...             encoding='utf-8')
        >>> print(html | Transformer('.//em').remove())
        <html><head><title>Some Title</title></head><body>Some
        text.</body></html>

        :rtype: `Transformer`
        """
        return self.apply(RemoveTransformation())

    #{ Direct element operations

    def unwrap(self):
        """Remove outermost enclosing elements from selection.

        Example:

        >>> html = HTML('<html><head><title>Some Title</title></head>'
        ...             '<body>Some <em>body</em> text.</body></html>',
        ...             encoding='utf-8')
        >>> print(html | Transformer('.//em').unwrap())
        <html><head><title>Some Title</title></head><body>Some body
        text.</body></html>

        :rtype: `Transformer`
        """
        return self.apply(UnwrapTransformation())

    def wrap(self, element):
        """Wrap selection in an element.

        >>> html = HTML('<html><head><title>Some Title</title></head>'
        ...             '<body>Some <em>body</em> text.</body></html>',
        ...             encoding='utf-8')
        >>> print(html | Transformer('.//em').wrap('strong'))
        <html><head><title>Some Title</title></head><body>Some
        <strong><em>body</em></strong> text.</body></html>

        :param element: either a tag name (as string) or an `Element` object
        :rtype: `Transformer`
        """
        return self.apply(WrapTransformation(element))

    #{ Content insertion operations

    def replace(self, content):
        """Replace selection with content.

        >>> html = HTML('<html><head><title>Some Title</title></head>'
        ...             '<body>Some <em>body</em> text.</body></html>',
        ...             encoding='utf-8')
        >>> print(html | Transformer('.//title/text()').replace('New Title'))
        <html><head><title>New Title</title></head><body>Some <em>body</em>
        text.</body></html>

        :param content: Either a callable, an iterable of events, or a string
                        to insert.
        :rtype: `Transformer`
        """
        return self.apply(ReplaceTransformation(content))

    def before(self, content):
        """Insert content before selection.

        In this example we insert the word 'emphasised' before the <em> opening
        tag:

        >>> html = HTML('<html><head><title>Some Title</title></head>'
        ...             '<body>Some <em>body</em> text.</body></html>',
        ...             encoding='utf-8')
        >>> print(html | Transformer('.//em').before('emphasised '))
        <html><head><title>Some Title</title></head><body>Some emphasised
        <em>body</em> text.</body></html>

        :param content: Either a callable, an iterable of events, or a string
                        to insert.
        :rtype: `Transformer`
        """
        return self.apply(BeforeTransformation(content))

    def after(self, content):
        """Insert content after selection.

        Here, we insert some text after the </em> closing tag:

        >>> html = HTML('<html><head><title>Some Title</title></head>'
        ...             '<body>Some <em>body</em> text.</body></html>',
        ...             encoding='utf-8')
        >>> print(html | Transformer('.//em').after(' rock'))
        <html><head><title>Some Title</title></head><body>Some <em>body</em>
        rock text.</body></html>

        :param content: Either a callable, an iterable of events, or a string
                        to insert.
        :rtype: `Transformer`
        """
        return self.apply(AfterTransformation(content))

    def prepend(self, content):
        """Insert content after the ENTER event of the selection.

        Inserting some new text at the start of the <body>:

        >>> html = HTML('<html><head><title>Some Title</title></head>'
        ...             '<body>Some <em>body</em> text.</body></html>',
        ...             encoding='utf-8')
        >>> print(html | Transformer('.//body').prepend('Some new body text. '))
        <html><head><title>Some Title</title></head><body>Some new body text.
        Some <em>body</em> text.</body></html>

        :param content: Either a callable, an iterable of events, or a string
                        to insert.
        :rtype: `Transformer`
        """
        return self.apply(PrependTransformation(content))

    def append(self, content):
        """Insert content before the END event of the selection.

        >>> html = HTML('<html><head><title>Some Title</title></head>'
        ...             '<body>Some <em>body</em> text.</body></html>',
        ...             encoding='utf-8')
        >>> print(html | Transformer('.//body').append(' Some new body text.'))
        <html><head><title>Some Title</title></head><body>Some <em>body</em>
        text. Some new body text.</body></html>

        :param content: Either a callable, an iterable of events, or a string
                        to insert.
        :rtype: `Transformer`
        """
        return self.apply(AppendTransformation(content))

    #{ Attribute manipulation

    def attr(self, name, value):
        """Add, replace or delete an attribute on selected elements.

        If `value` evaluates to `None` the attribute will be deleted from the
        element:

        >>> html = HTML('<html><head><title>Some Title</title></head>'
        ...             '<body>Some <em class="before">body</em> <em>text</em>.</body>'
        ...             '</html>', encoding='utf-8')
        >>> print(html | Transformer('body/em').attr('class', None))
        <html><head><title>Some Title</title></head><body>Some <em>body</em>
        <em>text</em>.</body></html>

        Otherwise the attribute will be set to `value`:

        >>> print(html | Transformer('body/em').attr('class', 'emphasis'))
        <html><head><title>Some Title</title></head><body>Some <em
        class="emphasis">body</em> <em class="emphasis">text</em>.</body></html>

        If `value` is a callable it will be called with the attribute name and
        the `START` event for the matching element. Its return value will then
        be used to set the attribute:

        >>> def print_attr(name, event):
        ...     attrs = event[1][1]
        ...     print(attrs)
        ...     return attrs.get(name)
        >>> print(html | Transformer('body/em').attr('class', print_attr))
        Attrs([(QName('class'), u'before')])
        Attrs()
        <html><head><title>Some Title</title></head><body>Some <em
        class="before">body</em> <em>text</em>.</body></html>

        :param name: the name of the attribute
        :param value: the value that should be set for the attribute.
        :rtype: `Transformer`
        """
        return self.apply(AttrTransformation(name, value))

    #{ Buffer operations

    def copy(self, buffer, accumulate=False):
        """Copy selection into buffer.

        The buffer is replaced by each *contiguous* selection before being passed
        to the next transformation. If accumulate=True, further selections will
        be appended to the buffer rather than replacing it.

        >>> from genshi.builder import tag
        >>> buffer = StreamBuffer()
        >>> html = HTML('<html><head><title>Some Title</title></head>'
        ...             '<body>Some <em>body</em> text.</body></html>',
        ...             encoding='utf-8')
        >>> print(html | Transformer('head/title/text()').copy(buffer)
        ...     .end().select('body').prepend(tag.h1(buffer)))
        <html><head><title>Some Title</title></head><body><h1>Some
        Title</h1>Some <em>body</em> text.</body></html>

        This example illustrates that only a single contiguous selection will
        be buffered:

        >>> print(html | Transformer('head/title/text()').copy(buffer)
        ...     .end().select('body/em').copy(buffer).end().select('body')
        ...     .prepend(tag.h1(buffer)))
        <html><head><title>Some Title</title></head><body><h1>Some
        Title</h1>Some <em>body</em> text.</body></html>
        >>> print(buffer)
        <em>body</em>

        Element attributes can also be copied for later use:

        >>> html = HTML('<html><head><title>Some Title</title></head>'
        ...             '<body><em>Some</em> <em class="before">body</em>'
        ...             '<em>text</em>.</body></html>',
        ...             encoding='utf-8')
        >>> buffer = StreamBuffer()
        >>> def apply_attr(name, entry):
        ...     return list(buffer)[0][1][1].get('class')
        >>> print(html | Transformer('body/em[@class]/@class').copy(buffer)
        ...     .end().buffer().select('body/em[not(@class)]')
        ...     .attr('class', apply_attr))
        <html><head><title>Some Title</title></head><body><em
        class="before">Some</em> <em class="before">body</em><em
        class="before">text</em>.</body></html>


        :param buffer: the `StreamBuffer` in which the selection should be
                       stored
        :param accumulate: if true, append each selection to the buffer
                           instead of replacing the buffer's contents
        :rtype: `Transformer`
        :note: Copy (and cut) copy each individual selected object into the
               buffer before passing to the next transform. For example, the
               XPath ``*|text()`` will select all elements and text, each
               instance of which will be copied to the buffer individually
               before passing to the next transform. This has implications for
               how ``StreamBuffer`` objects can be used, so some
               experimentation may be required.

        """
        return self.apply(CopyTransformation(buffer, accumulate))

    def cut(self, buffer, accumulate=False):
        """Copy selection into buffer and remove the selection from the stream.

        >>> from genshi.builder import tag
        >>> buffer = StreamBuffer()
        >>> html = HTML('<html><head><title>Some Title</title></head>'
        ...             '<body>Some <em>body</em> text.</body></html>',
        ...             encoding='utf-8')
        >>> print(html | Transformer('.//em/text()').cut(buffer)
        ...     .end().select('.//em').after(tag.h1(buffer)))
        <html><head><title>Some Title</title></head><body>Some
        <em/><h1>body</h1> text.</body></html>

        Specifying accumulate=True, appends all selected intervals onto the
        buffer. Combining this with the .buffer() operation allows us to operate
        on all copied events rather than per-segment. See the documentation on
        buffer() for more information.

        :param buffer: the `StreamBuffer` in which the selection should be
                       stored
        :param accumulate: if true, append each selection to the buffer
                           instead of replacing the buffer's contents
        :rtype: `Transformer`
        :note: this transformation will buffer the entire input stream
        """
        return self.apply(CutTransformation(buffer, accumulate))

    def buffer(self):
        """Buffer the entire stream (can consume a considerable amount of
        memory).

        Useful in conjunction with copy(accumulate=True) and
        cut(accumulate=True) to ensure that all marked events in the entire
        stream are copied to the buffer before further transformations are
        applied.

        For example, to move all <note> elements inside a <notes> tag at the
        top of the document:

        >>> doc = HTML('<doc><notes></notes><body>Some <note>one</note> '
        ...            'text <note>two</note>.</body></doc>',
        ...             encoding='utf-8')
        >>> buffer = StreamBuffer()
        >>> print(doc | Transformer('body/note').cut(buffer, accumulate=True)
        ...     .end().buffer().select('notes').prepend(buffer))
        <doc><notes><note>one</note><note>two</note></notes><body>Some  text
        .</body></doc>

        :rtype: `Transformer`
        """
        # ``list`` used as a transform consumes the whole marked stream before
        # the next transformation sees its first event.
        return self.apply(list)

    #{ Miscellaneous operations

    def filter(self, filter):
        """Apply a normal stream filter to the selection. The filter is called
        once for each contiguous block of marked events.

        >>> from genshi.filters.html import HTMLSanitizer
        >>> html = HTML('<html><body>Some text<script>alert(document.cookie)'
        ...             '</script> and some more text</body></html>',
        ...             encoding='utf-8')
        >>> print(html | Transformer('body/*').filter(HTMLSanitizer()))
        <html><body>Some text and some more text</body></html>

        :param filter: The stream filter to apply.
        :rtype: `Transformer`
        """
        return self.apply(FilterTransformation(filter))

    def map(self, function, kind):
        """Applies a function to the ``data`` element of events of ``kind`` in
        the selection.

        >>> html = HTML('<html><head><title>Some Title</title></head>'
        ...               '<body>Some <em>body</em> text.</body></html>',
        ...             encoding='utf-8')
        >>> print(html | Transformer('head/title').map(unicode.upper, TEXT))
        <html><head><title>SOME TITLE</title></head><body>Some <em>body</em>
        text.</body></html>

        :param function: the function to apply
        :param kind: the kind of event the function should be applied to
        :rtype: `Transformer`
        """
        return self.apply(MapTransformation(function, kind))

    def substitute(self, pattern, replace, count=1):
        """Replace text matching a regular expression.

        Refer to the documentation for ``re.sub()`` for details.

        >>> html = HTML('<html><body>Some text, some more text and '
        ...             '<b>some bold text</b>\\n'
        ...             '<i>some italicised text</i></body></html>',
        ...             encoding='utf-8')
        >>> print(html | Transformer('body/b').substitute('(?i)some', 'SOME'))
        <html><body>Some text, some more text and <b>SOME bold text</b>
        <i>some italicised text</i></body></html>
        >>> tags = tag.html(tag.body('Some text, some more text and\\n',
        ...      Markup('<b>some bold text</b>')))
        >>> print(tags.generate() | Transformer('body').substitute(
        ...     '(?i)some', 'SOME'))
        <html><body>SOME text, some more text and
        <b>SOME bold text</b></body></html>

        :param pattern: A regular expression object or string.
        :param replace: Replacement pattern.
        :param count: Number of replacements to make in each text fragment.
        :rtype: `Transformer`
        """
        return self.apply(SubstituteTransformation(pattern, replace, count))

    def rename(self, name):
        """Rename matching elements.

        >>> html = HTML('<html><body>Some text, some more text and '
        ...             '<b>some bold text</b></body></html>',
        ...             encoding='utf-8')
        >>> print(html | Transformer('body/b').rename('strong'))
        <html><body>Some text, some more text and <strong>some bold text</strong></body></html>

        :param name: the new name for the matching elements
        :rtype: `Transformer`
        """
        return self.apply(RenameTransformation(name))

    def trace(self, prefix='', fileobj=None):
        """Print events as they pass through the transform.

        >>> html = HTML('<body>Some <em>test</em> text</body>', encoding='utf-8')
        >>> print(html | Transformer('em').trace())
        (None, ('START', (QName('body'), Attrs()), (None, 1, 0)))
        (None, ('TEXT', u'Some ', (None, 1, 6)))
        ('ENTER', ('START', (QName('em'), Attrs()), (None, 1, 11)))
        ('INSIDE', ('TEXT', u'test', (None, 1, 15)))
        ('EXIT', ('END', QName('em'), (None, 1, 19)))
        (None, ('TEXT', u' text', (None, 1, 24)))
        (None, ('END', QName('body'), (None, 1, 29)))
        <body>Some <em>test</em> text</body>

        :param prefix: a string to prefix each event with in the output
        :param fileobj: the writable file-like object to write to; defaults to
                        the standard output stream
        :rtype: `Transformer`
        """
        return self.apply(TraceTransformation(prefix, fileobj=fileobj))

    # Internal methods

    def _mark(self, stream):
        """Turn a plain event stream into a marked stream by pairing every
        event with the `OUTSIDE` mark.

        :param stream: an iterable of events
        """
        for event in stream:
            yield OUTSIDE, event

    def _unmark(self, stream):
        """Turn a marked stream back into a plain event stream.

        The marks are discarded, and so are events whose kind is `None`,
        `ATTR` or `BREAK`, which are not passed on to the output.

        :param stream: an iterable of ``(mark, event)`` tuples
        """
        for mark, event in stream:
            kind = event[0]
            if not (kind is None or kind is ATTR or kind is BREAK):
                yield event
710
711
class SelectTransformation(object):
    """Select and mark events that match an XPath expression."""

    def __init__(self, path):
        """Create selection.

        :param path: an XPath expression (as string) or a `Path` object
        """
        if not isinstance(path, Path):
            path = Path(path)
        self.path = path

    def __call__(self, stream):
        """Apply the transform filter to the marked stream.

        Events whose incoming mark is `None` are passed through untouched and
        are not tested against the path. For every other event the path test
        decides the outgoing mark; a matching `START` event is marked `ENTER`
        and everything up to and including its matching `END` event is
        consumed and marked `INSIDE`/`EXIT`, whatever marks those events
        carried before.

        :param stream: the marked event stream to filter
        :return: a generator of ``(mark, event)`` tuples
        """
        namespaces = {}
        variables = {}
        test = self.path.test()
        stream = iter(stream)
        for mark, event in stream:
            if mark is None:
                yield mark, event
                continue
            result = test(event, namespaces, variables)
            # XXX This is effectively genshi.core._ensure() for transform
            # streams.
            if result is True:
                if event[0] is START:
                    yield ENTER, event
                    depth = 1
                    # Pull the element's content from the same iterator the
                    # outer loop uses. A plain ``for`` works on Python 2 and 3
                    # alike and simply stops if the stream ends before the
                    # element is closed, instead of leaking StopIteration out
                    # of this generator (a RuntimeError under PEP 479).
                    for _, subevent in stream:
                        kind = subevent[0]
                        if kind is START:
                            depth += 1
                        elif kind is END:
                            depth -= 1
                        if depth == 0:
                            yield EXIT, subevent
                        else:
                            yield INSIDE, subevent
                        # Keep the path matcher's state in step with the
                        # events that were consumed here.
                        test(subevent, namespaces, variables, updateonly=True)
                        if depth == 0:
                            break
                else:
                    yield OUTSIDE, event
            elif isinstance(result, Attrs):
                # XXX  Selected *attributes* are emitted as a synthetic event
                # with the pseudo-kind ATTR to indicate they are not really
                # part of the stream; the element itself follows unmarked.
                yield ATTR, (ATTR, (QName(event[1][0] + '@*'), result), event[2])
                yield None, event
            elif isinstance(result, tuple):
                yield OUTSIDE, result
            elif result:
                # XXX Assume everything else is "text"?
                yield None, (TEXT, unicode(result), (None, -1, -1))
            else:
                yield None, event
770
771
class InvertTransformation(object):
    """Invert the selection, swapping marked and unmarked events.

    Specifically, every event with a true mark is re-emitted with a null
    (``None``) mark, and every event with a null mark is re-emitted with the
    ``OUTSIDE`` mark.
    """

    def __call__(self, stream):
        """Apply the inversion to the marked stream.

        :param stream: the marked event stream to filter
        """
        for mark, event in stream:
            flipped = None
            if not mark:
                flipped = OUTSIDE
            yield flipped, event
789
790
class EndTransformation(object):
    """Terminate the current selection."""

    def __call__(self, stream):
        """Re-mark every event of the marked stream as ``OUTSIDE``.

        The incoming marks are discarded, whatever they were.

        :param stream: the marked event stream to filter
        """
        for _, event in stream:
            remarked = (OUTSIDE, event)
            yield remarked
801
802
class EmptyTransformation(object):
    """Empty selected elements of all content."""

    def __call__(self, stream):
        """Apply the transform filter to the marked stream.

        Every event is passed on unchanged, except for those between an
        ``ENTER`` event and the next ``EXIT`` event, which are dropped.

        :param stream: the marked event stream to filter
        """
        # The inner loop must continue where the outer loop stopped. That
        # only holds for a one-shot iterator, so make sure we have one (a
        # list would otherwise be re-read from its first event).
        stream = iter(stream)
        for mark, event in stream:
            yield mark, event
            if mark is ENTER:
                # Discard the element's content; only its EXIT survives.
                for mark, event in stream:
                    if mark is EXIT:
                        yield mark, event
                        break
818
819
class RemoveTransformation(object):
    """Drop the selection from the stream."""

    def __call__(self, stream):
        """Pass on only the events that are not part of the selection.

        An event survives only if its mark is ``None``.

        :param stream: the marked event stream to filter
        """
        for pair in stream:
            mark, event = pair
            if mark is not None:
                continue
            yield mark, event
831
832
class UnwrapTransformation(object):
    """Remove the outermost enclosing elements from the selection."""

    def __call__(self, stream):
        """Drop the ``ENTER`` and ``EXIT`` events of the marked stream.

        All other events are passed on with their mark unchanged.

        :param stream: the marked event stream to filter
        """
        for mark, event in stream:
            if mark in (ENTER, EXIT):
                continue
            yield mark, event
844
845
class WrapTransformation(object):
    """Wrap selection in an element."""

    def __init__(self, element):
        """Create the transform.

        :param element: the wrapping element, either an `Element` instance
                        or a value that is passed to the `Element`
                        constructor
        """
        if isinstance(element, Element):
            self.element = element
        else:
            self.element = Element(element)

    def __call__(self, stream):
        """Apply the transform filter to the marked stream.

        Each selection is preceded by all but the last event generated by
        the wrapping element and followed by that last event. The wrapper's
        events are emitted with a ``None`` mark.

        :param stream: the marked event stream to filter
        """
        # The inner loop must continue where the outer loop stopped. That
        # only holds for a one-shot iterator, so make sure we have one (a
        # list would otherwise be re-read from its first event).
        stream = iter(stream)
        for mark, event in stream:
            if mark:
                element = list(self.element.generate())
                for prefix in element[:-1]:
                    yield None, prefix
                yield mark, event
                start = mark
                # ``stopped`` records that no event is left over to be
                # re-emitted after the wrapper has been closed.
                stopped = False
                for mark, event in stream:
                    if start is ENTER and mark is EXIT:
                        yield mark, event
                        stopped = True
                        break
                    if not mark:
                        # First unmarked event: the selection is over and
                        # this event belongs after the wrapper.
                        break
                    yield mark, event
                else:
                    # Stream exhausted while still inside the selection.
                    stopped = True
                yield None, element[-1]
                if not stopped:
                    yield mark, event
            else:
                yield mark, event
879
880
class TraceTransformation(object):
    """Print events as they pass through the transform."""

    def __init__(self, prefix='', fileobj=None):
        """Trace constructor.

        :param prefix: text to prefix each traced line with.
        :param fileobj: the writable file-like object to write to; when
                        ``None``, the standard output stream in effect at
                        construction time is used
        """
        self.prefix = prefix
        # Test for None explicitly: a perfectly usable file-like object may
        # be falsy (for instance one that defines ``__len__`` and is still
        # empty) and must not be swapped for stdout behind the caller's back.
        if fileobj is None:
            fileobj = sys.stdout
        self.fileobj = fileobj

    def __call__(self, stream):
        """Apply the transform filter to the marked stream.

        Each item of the stream is written to the file object on a line of
        its own, preceded by the prefix, and then passed on unchanged.

        :param stream: the marked event stream to filter
        """
        for event in stream:
            self.fileobj.write('%s%s\n' % (self.prefix, event))
            yield event
901
902
class FilterTransformation(object):
    """Apply a normal stream filter to the selection. The filter is called once
    for each selection."""

    def __init__(self, filter):
        """Create the transform.

        :param filter: The stream filter to apply.
        """
        self.filter = filter

    def __call__(self, stream):
        """Apply the transform filter to the marked stream.

        A selection is either an element (``ENTER`` up to and including the
        next ``EXIT``) or a run of consecutive ``OUTSIDE`` events. Its
        unmarked events are collected, handed to the filter in one call, and
        whatever the filter produces is emitted with the ``OUTSIDE`` mark.

        :param stream: The marked event stream to filter
        """
        def flush(queue):
            # Run the filter over the queued events, then empty the queue
            # in place so it can be reused for the next selection.
            if queue:
                for event in self.filter(queue):
                    yield OUTSIDE, event
                del queue[:]

        queue = []
        # The inner loops must continue where the outer loop stopped. That
        # only holds for a one-shot iterator, so make sure we have one (a
        # list would otherwise be re-read from its first event).
        stream = iter(stream)
        for mark, event in stream:
            if mark is ENTER:
                queue.append(event)
                for mark, event in stream:
                    queue.append(event)
                    if mark is EXIT:
                        break
                for queue_event in flush(queue):
                    yield queue_event
            elif mark is OUTSIDE:
                stopped = False
                queue.append(event)
                for mark, event in stream:
                    if mark is not OUTSIDE:
                        break
                    queue.append(event)
                else:
                    # Stream exhausted: no terminating event to pass on.
                    stopped = True
                for queue_event in flush(queue):
                    yield queue_event
                if not stopped:
                    # NOTE(review): the event that ended the run is passed
                    # on as is, even when its mark is ENTER, so an element
                    # starting right after an OUTSIDE run is not filtered.
                    # Confirm whether that is intended.
                    yield mark, event
            else:
                yield mark, event
        for queue_event in flush(queue):
            yield queue_event
952
953
class MapTransformation(object):
    """Run a function over the `data` item of those selected events that are
    of a given ``kind``.
    """

    def __init__(self, function, kind):
        """Set up the mapping.

        :param function: a one-argument callable; it receives the `data`
                         item of each selected event and returns the
                         replacement
        :param kind: the event ``kind`` the `function` is applied to;
                     ``None`` applies it to selected events of any kind
        """
        self.kind = kind
        self.function = function

    def __call__(self, stream):
        """Run the mapping over the marked stream.

        Events with a false mark, or of another kind, pass through with
        their data untouched.

        :param stream: The marked event stream to filter
        """
        for mark, (kind, data, pos) in stream:
            if mark and self.kind in (None, kind):
                data = self.function(data)
            yield mark, (kind, data, pos)
979
980
class SubstituteTransformation(object):
    """Replace text that matches a regular expression.

    See the documentation of ``re.sub()`` for the meaning of the arguments.
    """
    def __init__(self, pattern, replace, count=0):
        """Set up the substitution.

        :param pattern: A regular expression object, or a string that is
                        compiled into one.
        :param replace: Replacement pattern.
        :param count: Number of replacements to make in each text fragment.
        """
        if isinstance(pattern, basestring):
            pattern = re.compile(pattern)
        self.pattern = pattern
        self.replace = replace
        self.count = count

    def __call__(self, stream):
        """Run the substitution over the marked stream.

        Only ``TEXT`` events whose mark is not ``None`` are rewritten; text
        that came in as `Markup` goes out as `Markup` again.

        :param stream: The marked event stream to filter
        """
        for mark, (kind, data, pos) in stream:
            if mark is None or kind is not TEXT:
                yield mark, (kind, data, pos)
                continue
            replaced = self.pattern.sub(self.replace, data, self.count)
            if isinstance(data, Markup):
                replaced = Markup(replaced)
            yield mark, (kind, replaced, pos)
1013
1014
class RenameTransformation(object):
    """Give matching elements a new name."""
    def __init__(self, name):
        """Set up the renaming.

        :param name: New element name; it is converted to a `QName`.
        """
        self.name = QName(name)

    def __call__(self, stream):
        """Run the renaming over the marked stream.

        The data of ``ENTER`` events becomes the new name paired with the
        event's original second data item; the data of ``EXIT`` events
        becomes the new name. All other events are passed on unchanged.

        :param stream: The marked event stream to filter
        """
        for mark, (kind, data, pos) in stream:
            if mark is ENTER:
                renamed = (self.name, data[1])
            elif mark is EXIT:
                renamed = self.name
            else:
                renamed = data
            yield mark, (kind, renamed, pos)
1035
1036
class InjectorTransformation(object):
    """Abstract base class for transformations that add content to a stream.

    Subclasses implement ``__call__`` and obtain the events to insert from
    `_inject`.

    >>> class Top(InjectorTransformation):
    ...     def __call__(self, stream):
    ...         for event in self._inject():
    ...             yield event
    ...         for event in stream:
    ...             yield event
    >>> html = HTML('<body>Some <em>test</em> text</body>', encoding='utf-8')
    >>> print(html | Transformer('.//em').apply(Top('Prefix ')))
    Prefix <body>Some <em>test</em> text</body>
    """
    def __init__(self, content):
        """Set up the injector.

        :param content: An iterable of Genshi stream events, or a string to be
                        injected.
        """
        self.content = content

    def _inject(self):
        """Generate the content to insert as unmarked events.

        If the content has a ``__call__`` attribute it is called first and
        the result is used in its place. Every resulting event is yielded
        with a ``None`` mark.
        """
        payload = self.content
        if hasattr(payload, '__call__'):
            payload = payload()
        for injected in _ensure(payload):
            yield None, injected
1065
1066
class ReplaceTransformation(InjectorTransformation):
    """Substitute the content for the selection."""

    def __call__(self, stream):
        """Run the replacement over the marked stream.

        Unmarked events pass through. At the start of each selection the
        content is emitted, and the selected events are then discarded.

        :param stream: The marked event stream to filter
        """
        stream = PushBackStream(stream)
        for mark, event in stream:
            if mark is None:
                yield mark, event
                continue
            start = mark
            for injected in self._inject():
                yield injected
            whole_element = start is ENTER
            # Swallow the remainder of the selection.
            for mark, event in stream:
                if whole_element:
                    # An element ends with its EXIT event, which is
                    # swallowed as well.
                    if mark is EXIT:
                        break
                elif mark != start:
                    # A run of other marked events ends at the first event
                    # with a different mark; that event is not part of the
                    # selection, so hand it back.
                    stream.push((mark, event))
                    break
1090
1091
class BeforeTransformation(InjectorTransformation):
    """Put the content in front of the selection."""

    def __call__(self, stream):
        """Run the insertion over the marked stream.

        The content is emitted ahead of the first event of each selection;
        the selection itself is passed on unchanged.

        :param stream: The marked event stream to filter
        """
        stream = PushBackStream(stream)
        for mark, event in stream:
            if mark is None:
                yield mark, event
                continue
            start = mark
            for injected in self._inject():
                yield injected
            yield mark, event
            whole_element = start is ENTER
            # Pass on the rest of the selection so that the content is not
            # injected again in the middle of it.
            for mark, event in stream:
                if not whole_element and mark != start:
                    # Not part of this selection: hand it back.
                    stream.push((mark, event))
                    break
                yield mark, event
                if whole_element and mark is EXIT:
                    break
1116
1117
class AfterTransformation(InjectorTransformation):
    """Put the content behind the selection."""

    def __call__(self, stream):
        """Run the insertion over the marked stream.

        Every event is passed on unchanged; once the last event of a
        selection has gone out, the content is emitted.

        :param stream: The marked event stream to filter
        """
        stream = PushBackStream(stream)
        for mark, event in stream:
            yield mark, event
            if not mark:
                continue
            start = mark
            whole_element = start is ENTER
            # Pass on the rest of the selection first.
            for mark, event in stream:
                if not whole_element and mark != start:
                    # Not part of this selection: hand it back.
                    stream.push((mark, event))
                    break
                yield mark, event
                if whole_element and mark is EXIT:
                    break
            for injected in self._inject():
                yield injected
1140
1141
class PrependTransformation(InjectorTransformation):
    """Insert content at the very start of selected elements."""

    def __call__(self, stream):
        """Run the insertion over the marked stream.

        Every event is passed on unchanged; the content is emitted directly
        after each ``ENTER`` event.

        :param stream: The marked event stream to filter
        """
        for mark, event in stream:
            yield mark, event
            if mark is not ENTER:
                continue
            for injected in self._inject():
                yield injected
1155
1156
class AppendTransformation(InjectorTransformation):
    """Append content after the content of selected elements."""

    def __call__(self, stream):
        """Apply the transform filter to the marked stream.

        Every event is passed on unchanged; the content is emitted just
        ahead of the ``EXIT`` event that follows an ``ENTER`` event.

        :param stream: The marked event stream to filter
        """
        # The inner loop must continue where the outer loop stopped. That
        # only holds for a one-shot iterator, so make sure we have one (a
        # list would otherwise be re-read from its first event).
        stream = iter(stream)
        for mark, event in stream:
            yield mark, event
            if mark is ENTER:
                closing = None
                for mark, event in stream:
                    if mark is EXIT:
                        closing = (mark, event)
                        break
                    yield mark, event
                for subevent in self._inject():
                    yield subevent
                # Only emit the EXIT event if there was one. If the stream
                # ended inside the element, the last event has already been
                # passed on and must not be repeated.
                if closing is not None:
                    yield closing
1175
1176
class AttrTransformation(object):
    """Set, or remove, an attribute on selected elements."""

    def __init__(self, name, value):
        """Set up the transform.

        :param name: name of the attribute that should be set
        :param value: the value to set; ``None`` removes the attribute. If
                      the value has a ``__call__`` attribute it is called
                      with the attribute name and the event, and its result
                      is used as the value.
        """
        self.name = name
        self.value = value

    def __call__(self, stream):
        """Run the transform over the marked stream.

        Only ``ENTER`` events are modified; everything else passes through.

        :param stream: The marked event stream to filter
        """
        value_is_callable = hasattr(self.value, '__call__')
        for mark, (kind, data, pos) in stream:
            if mark is not ENTER:
                yield mark, (kind, data, pos)
                continue
            value = self.value
            if value_is_callable:
                value = value(self.name, (kind, data, pos))
            tag, attrs = data[0], data[1]
            if value is None:
                attrs = attrs - [QName(self.name)]
            else:
                attrs = attrs | [(QName(self.name), value)]
            yield mark, (kind, (tag, attrs), pos)
1207
1208
1209
class StreamBuffer(Stream):
    """Buffer of stream events, used by the cut and copy transformations."""

    def __init__(self):
        """Set up an empty buffer."""
        Stream.__init__(self, [])

    def append(self, event):
        """Store one more event at the end of the buffer.

        :param event: the markup event to add
        """
        buffered = self.events
        buffered.append(event)

    def reset(self):
        """Discard all buffered events, emptying the event list in place."""
        self.events[:] = []
1227
1228
class CopyTransformation(object):
    """Store a copy of the selected events in a buffer for later insertion."""

    def __init__(self, buffer, accumulate=False):
        """Set up the copy transformation.

        :param buffer: the `StreamBuffer` in which the selection should be
                       stored
        :param accumulate: if true, every selection is added to what the
                           buffer already holds; otherwise the buffer is
                           emptied here and again before each selection is
                           copied
        """
        if not accumulate:
            buffer.reset()
        self.buffer = buffer
        self.accumulate = accumulate

    def __call__(self, stream):
        """Run the transformation over the marked stream.

        The stream content is passed on unchanged. The events of a
        selection are held back until the whole selection has been copied
        into the buffer, and are emitted after that.

        :param stream: the marked event stream to filter
        """
        stream = PushBackStream(stream)

        for mark, event in stream:
            if not mark:
                yield mark, event
                continue
            if not self.accumulate:
                self.buffer.reset()
            selection = [(mark, event)]
            self.buffer.append(event)
            start = mark
            whole_element = start is ENTER
            for mark, event in stream:
                if not whole_element and mark != start:
                    # Not part of this selection: hand it back.
                    stream.push((mark, event))
                    break
                selection.append((mark, event))
                self.buffer.append(event)
                if whole_element and mark is EXIT:
                    break
            for pair in selection:
                yield pair
1269
1270
class CutTransformation(object):
    """Cut selected events into a buffer for later insertion and remove the
    selection.
    """

    def __init__(self, buffer, accumulate=False):
        """Create the cut transformation.

        :param buffer: the `StreamBuffer` in which the selection should be
                       stored
        :param accumulate: if true, every selection is added to what the
                           buffer already holds; otherwise the buffer is
                           emptied before each selection is stored
        """
        self.buffer = buffer
        self.accumulate = accumulate

    def __call__(self, stream):
        """Apply the transform filter to the marked stream.

        Unmarked events are passed on; marked events are moved into the
        buffer instead. When attributes were selected, they are removed from
        the ``START`` event that follows them.

        :param stream: the marked event stream to filter
        """
        # Names of the attributes selected for the element being processed.
        attributes = []
        stream = PushBackStream(stream)
        # True while the last event seen was an unmarked one, i.e. something
        # has been emitted since the previous selection.
        broken = False
        if not self.accumulate:
            self.buffer.reset()
        for mark, event in stream:
            if mark:
                # Send a BREAK event if there was no other event sent between
                if not self.accumulate:
                    if not broken and self.buffer:
                        yield BREAK, (BREAK, None, None)
                    self.buffer.reset()
                self.buffer.append(event)
                start = mark
                if mark is ATTR:
                    attributes.extend([name for name, _ in event[1][1]])
                for mark, event in stream:
                    if start is mark is ATTR:
                        attributes.extend([name for name, _ in event[1][1]])
                    # Handle non-element contiguous selection
                    if start is not ENTER and mark != start:
                        # Operating on the attributes of a START event
                        if start is ATTR:
                            kind, data, pos = event
                            assert kind is START
                            data = (data[0], data[1] - attributes)
                            # Start afresh for the next element. This has to
                            # stay a list: the next attribute selection
                            # extends it again, which would raise an
                            # AttributeError on None.
                            attributes = []
                            stream.push((mark, (kind, data, pos)))
                        else:
                            stream.push((mark, event))
                        break
                    self.buffer.append(event)
                    if start is ENTER and mark is EXIT:
                        break
                broken = False
            else:
                broken = True
                yield mark, event
        if not broken and self.buffer:
            yield BREAK, (BREAK, None, None)
Note: See TracBrowser for help on using the repository browser.