Edgewall Software

source: branches/stable/0.5.x/genshi/util.py

Last change on this file was 771, checked in by cmlenz, 16 years ago

The striptags function now also removes HTML/XML-style comments. Closes #150. Thanks to Armin Ronacher for the report and suggested fix.

  • Property svn:eol-style set to native
File size: 6.8 KB
Line 
1# -*- coding: utf-8 -*-
2#
3# Copyright (C) 2006-2007 Edgewall Software
4# All rights reserved.
5#
6# This software is licensed as described in the file COPYING, which
7# you should have received as part of this distribution. The terms
8# are also available at http://genshi.edgewall.org/wiki/License.
9#
10# This software consists of voluntary contributions made by many
11# individuals. For the exact contribution history, see the revision
12# history and logs, available at http://genshi.edgewall.org/log/.
13
14"""Various utility classes and functions."""
15
16import htmlentitydefs
17import re
18try:
19    set
20except NameError:
21    from sets import ImmutableSet as frozenset
22    from sets import Set as set
23
24__docformat__ = 'restructuredtext en'
25
26
class LRUCache(dict):
    """A dictionary-like object that stores only a certain number of items, and
    discards its least recently used item when full.

    >>> cache = LRUCache(3)
    >>> cache['A'] = 0
    >>> cache['B'] = 1
    >>> cache['C'] = 2
    >>> len(cache)
    3

    >>> cache['A']
    0

    Adding new items to the cache does not increase its size. Instead, the least
    recently used item is dropped:

    >>> cache['D'] = 3
    >>> len(cache)
    3
    >>> 'B' in cache
    False

    Iterating over the cache returns the keys, starting with the most recently
    used:

    >>> for key in cache:
    ...     print key
    D
    A
    C

    The items live in a private dictionary plus a doubly linked list that
    records the usage order; the inherited `dict` storage itself is never
    populated. Only the methods defined on this class (item access and
    assignment, ``in``, ``len()``, iteration, ``get()`` and ``repr()``) see
    the cached items. Other methods inherited from `dict` are not overridden
    and operate on the empty base dictionary.

    This code is based on the LRUCache class from ``myghtyutils.util``, written
    by Mike Bayer and released under the MIT license. See:

      http://svn.myghty.org/myghtyutils/trunk/lib/myghtyutils/util.py

    :param capacity: the maximum number of items to keep in the cache
    """

    class _Item(object):
        """Node of the linked list holding one key/value pair."""
        def __init__(self, key, value):
            self.previous = self.next = None
            self.key = key
            self.value = value
        def __repr__(self):
            return repr(self.value)

    def __init__(self, capacity):
        self._dict = dict()
        self.capacity = capacity
        # `head` is the most recently used item, `tail` the least recently used
        self.head = None
        self.tail = None

    def __contains__(self, key):
        return key in self._dict

    def __iter__(self):
        """Yield the keys, starting with the most recently used."""
        cur = self.head
        while cur is not None:
            yield cur.key
            cur = cur.next

    def __len__(self):
        return len(self._dict)

    def __getitem__(self, key):
        """Return the value for `key` and mark it as most recently used.

        :raise KeyError: if the key is not in the cache
        """
        item = self._dict[key]
        self._update_item(item)
        return item.value

    def __setitem__(self, key, value):
        """Store `value` under `key` and mark it as most recently used."""
        item = self._dict.get(key)
        if item is None:
            item = self._Item(key, value)
            self._dict[key] = item
            self._insert_item(item)
        else:
            item.value = value
            self._update_item(item)
            self._manage_size()

    def __repr__(self):
        return repr(self._dict)

    def get(self, key, default=None):
        """Return the value for `key`, or `default` if it is not cached.

        The inherited `dict.get` would look at the empty base dictionary and
        always return `default`, so the lookup is routed through
        `__getitem__`; a hit therefore also marks the item as most recently
        used.
        """
        try:
            return self[key]
        except KeyError:
            return default

    def _insert_item(self, item):
        """Link a new item in at the head of the list and evict if needed."""
        item.previous = None
        item.next = self.head
        if self.head is not None:
            self.head.previous = item
        else:
            self.tail = item
        self.head = item
        self._manage_size()

    def _manage_size(self):
        """Drop least recently used items until the capacity is respected."""
        # The `tail` check stops the loop once the list is empty, so that a
        # negative capacity cannot dereference a missing tail.
        while self.tail is not None and len(self._dict) > self.capacity:
            del self._dict[self.tail.key]
            if self.tail is not self.head:
                self.tail = self.tail.previous
                self.tail.next = None
            else:
                self.head = self.tail = None

    def _update_item(self, item):
        """Move an already linked item to the head of the list."""
        if self.head is item:
            return

        # `item` is not the head here, so it has a predecessor
        previous = item.previous
        previous.next = item.next
        if item.next is not None:
            item.next.previous = previous
        else:
            self.tail = previous

        item.previous = None
        item.next = self.head
        # Targets are assigned left to right: the old head is linked back to
        # `item` first, then `item` becomes the new head.
        self.head.previous = self.head = item
145
def flatten(items):
    """Flattens a potentially nested sequence into a flat list.

    Only `frozenset`, `list`, `set` and `tuple` elements are expanded
    (recursively); any other element, including strings, is copied to the
    result as is.

    :param items: the sequence to flatten
    :return: a new list with the nested elements in iteration order

    >>> flatten((1, 2))
    [1, 2]
    >>> flatten([1, (2, 3), 4])
    [1, 2, 3, 4]
    >>> flatten([1, (2, [3, 4]), 5])
    [1, 2, 3, 4, 5]
    """
    retval = []
    for item in items:
        if isinstance(item, (frozenset, list, set, tuple)):
            retval.extend(flatten(item))
        else:
            retval.append(item)
    return retval
165
def plaintext(text, keeplinebreaks=True):
    """Returns the text as a `unicode` string with all entities and tags
    removed.

    >>> plaintext('<b>1 &lt; 2</b>')
    u'1 < 2'

    The `keeplinebreaks` parameter can be set to ``False`` to replace any line
    breaks by simple spaces:

    >>> plaintext('''<b>1
    ... &lt;
    ... 2</b>''', keeplinebreaks=False)
    u'1 < 2'

    :param text: the text to convert to plain text
    :param keeplinebreaks: whether line breaks in the text should be kept intact
    :return: the text with tags and entities removed
    """
    # Tags are removed first, so that markup produced by decoding entities
    # such as ``&lt;b&gt;`` is kept as literal text instead of being stripped.
    text = stripentities(striptags(text))
    if not keeplinebreaks:
        text = text.replace(u'\n', u' ')
    return text
189
# Group 1 captures a numeric reference (decimal, or hexadecimal with an ``x``
# or ``X`` prefix; the trailing semicolon is optional), group 2 the name of a
# character entity.
_STRIPENTITIES_RE = re.compile(r'&(?:#((?:\d+)|(?:[xX][0-9a-fA-F]+));?|(\w+);)')
def stripentities(text, keepxmlentities=False):
    """Return a copy of the given text with any character or numeric entities
    replaced by the equivalent Unicode characters.

    >>> stripentities('1 &lt; 2')
    u'1 < 2'
    >>> stripentities('more &hellip;')
    u'more \u2026'
    >>> stripentities('&#8230;')
    u'\u2026'
    >>> stripentities('&#x2026;')
    u'\u2026'
    >>> stripentities('&#X2026;')
    u'\u2026'

    If the `keepxmlentities` parameter is provided and is a truth value, the
    core XML entities (&amp;, &apos;, &gt;, &lt; and &quot;) are left intact.

    >>> stripentities('1 &lt; 2 &hellip;', keepxmlentities=True)
    u'1 &lt; 2 \u2026'

    Unknown character entities are reduced to their bare name, or to the
    name with an escaped ampersand when `keepxmlentities` is set. Numeric
    references that do not denote a valid code point are left as they are.

    :param text: the text in which to replace the entities
    :param keepxmlentities: whether the core XML entities should be kept
    :return: the text with the entities replaced
    """
    def _replace_entity(match):
        if match.group(1): # numeric entity
            ref = match.group(1)
            # The pattern accepts both ``x`` and ``X`` as the hex marker
            if ref[0] in 'xX':
                codepoint = int(ref[1:], 16)
            else:
                codepoint = int(ref, 10)
            try:
                return unichr(codepoint)
            except (ValueError, OverflowError):
                # Not a code point `unichr` can represent: keep the
                # reference text rather than failing on the whole input.
                return match.group(0)
        else: # character entity
            ref = match.group(2)
            if keepxmlentities and ref in ('amp', 'apos', 'gt', 'lt', 'quot'):
                return u'&%s;' % ref
            try:
                return unichr(htmlentitydefs.name2codepoint[ref])
            except KeyError:
                if keepxmlentities:
                    return u'&amp;%s;' % ref
                else:
                    return ref
    return _STRIPENTITIES_RE.sub(_replace_entity, text)
230
# The comment alternative comes first and is compiled with DOTALL, so that a
# comment spanning several lines is removed as a whole instead of being cut
# off at the first ``>`` by the generic tag alternative.
_STRIPTAGS_RE = re.compile(r'(<!--.*?-->|<[^>]*>)', re.DOTALL)
def striptags(text):
    """Return a copy of the text with any XML/HTML tags removed.

    >>> striptags('<span>Foo</span> bar')
    'Foo bar'
    >>> striptags('<span class="bar">Foo</span>')
    'Foo'
    >>> striptags('Foo<br />')
    'Foo'

    HTML/XML comments are stripped, too:

    >>> striptags('<!-- <blub>hehe</blah> -->test')
    'test'

    This also applies to comments that span more than one line:

    >>> striptags('''<!-- first line
    ... <b>second</b> line -->test''')
    'test'

    :param text: the string to remove tags from
    :return: the text with tags removed
    """
    return _STRIPTAGS_RE.sub('', text)
Note: See TracBrowser for help on using the repository browser.