Edgewall Software

source: branches/stable/0.6.x/genshi/util.py

Last change on this file was 1082, checked in by cmlenz, 14 years ago

More bits of 2to3 related cleanup.

  • Property svn:eol-style set to native
File size: 7.2 KB
Line 
1# -*- coding: utf-8 -*-
2#
3# Copyright (C) 2006-2009 Edgewall Software
4# All rights reserved.
5#
6# This software is licensed as described in the file COPYING, which
7# you should have received as part of this distribution. The terms
8# are also available at http://genshi.edgewall.org/wiki/License.
9#
10# This software consists of voluntary contributions made by many
11# individuals. For the exact contribution history, see the revision
12# history and logs, available at http://genshi.edgewall.org/log/.
13
14"""Various utility classes and functions."""
15
16import htmlentitydefs as entities
17import re
18
19__docformat__ = 'restructuredtext en'
20
21
class LRUCache(dict):
    """A dictionary-like object that stores only a certain number of items, and
    discards its least recently used item when full.

    >>> cache = LRUCache(3)
    >>> cache['A'] = 0
    >>> cache['B'] = 1
    >>> cache['C'] = 2
    >>> len(cache)
    3

    >>> cache['A']
    0

    Adding new items to the cache does not increase its size. Instead, the least
    recently used item is dropped:

    >>> cache['D'] = 3
    >>> len(cache)
    3
    >>> 'B' in cache
    False

    Iterating over the cache returns the keys, starting with the most recently
    used:

    >>> for key in cache:
    ...     print(key)
    D
    A
    C

    Lookups via ``get()`` go through the cache as well and count as a use of
    the item; a missing key yields the given default:

    >>> cache.get('C')
    2
    >>> cache.get('X', 'n/a')
    'n/a'

    The items are kept in a private dictionary (``_dict``) rather than in the
    inherited ``dict`` storage, so only the operations defined on this class
    see the cached items. Other inherited ``dict`` methods (such as ``keys()``
    or ``pop()``) operate on the unused base storage.

    This code is based on the LRUCache class from ``myghtyutils.util``, written
    by Mike Bayer and released under the MIT license. See:

      http://svn.myghty.org/myghtyutils/trunk/lib/myghtyutils/util.py
    """

    class _Item(object):
        """Node of the doubly linked list that records the usage order."""
        def __init__(self, key, value):
            self.prv = self.nxt = None
            self.key = key
            self.value = value
        def __repr__(self):
            return repr(self.value)

    def __init__(self, capacity):
        """Create an empty cache.

        :param capacity: the maximum number of items kept in the cache
        """
        self._dict = dict()
        self.capacity = capacity
        # `head` is the most recently used item, `tail` the least recently
        # used one; both are `None` while the cache is empty
        self.head = None
        self.tail = None

    def __contains__(self, key):
        return key in self._dict

    def __iter__(self):
        cur = self.head
        while cur:
            yield cur.key
            cur = cur.nxt

    def __len__(self):
        return len(self._dict)

    def __getitem__(self, key):
        item = self._dict[key]
        self._update_item(item)
        return item.value

    def __setitem__(self, key, value):
        item = self._dict.get(key)
        if item is None:
            item = self._Item(key, value)
            self._dict[key] = item
            self._insert_item(item)
        else:
            item.value = value
            self._update_item(item)
            # Nothing was added, but `capacity` is a plain attribute that may
            # have been lowered since the last insertion
            self._manage_size()

    def __repr__(self):
        return repr(self._dict)

    def get(self, key, default=None):
        """Return the value stored for `key`, or `default` if the key is not
        in the cache.

        A successful lookup marks the item as the most recently used one. This
        override is needed because the inherited ``dict.get`` would look at the
        base storage, which never holds any items.

        :param key: the key to look up
        :param default: the value to return if the key is missing
        :return: the cached value or `default`
        """
        item = self._dict.get(key)
        if item is None:
            return default
        self._update_item(item)
        return item.value

    def _insert_item(self, item):
        """Link a new item in at the head of the list, then evict if needed."""
        item.prv = None
        item.nxt = self.head
        if self.head is not None:
            self.head.prv = item
        else:
            self.tail = item
        self.head = item
        self._manage_size()

    def _manage_size(self):
        """Drop items from the tail until the cache fits its capacity."""
        # The check on `tail` stops the loop once the list is empty, which
        # would otherwise fail for a negative capacity
        while self.tail is not None and len(self._dict) > self.capacity:
            del self._dict[self.tail.key]
            if self.tail is not self.head:
                self.tail = self.tail.prv
                self.tail.nxt = None
            else:
                self.head = self.tail = None

    def _update_item(self, item):
        """Move an item that is already linked to the head of the list."""
        if self.head is item:
            return

        # Unlink the item from its current position; it is not the head, so
        # it always has a predecessor
        prv = item.prv
        prv.nxt = item.nxt
        if item.nxt is not None:
            item.nxt.prv = prv
        else:
            self.tail = prv

        # Relink it in front of the current head
        item.prv = None
        item.nxt = self.head
        self.head.prv = item
        self.head = item
139
140
def flatten(items):
    """Collapse an arbitrarily nested sequence into a single flat list.

    Elements that are themselves a ``frozenset``, ``list``, ``set`` or
    ``tuple`` are expanded recursively; every other element is copied to the
    result unchanged.

    :param items: the sequence to flatten
    :return: a new list containing the non-container elements

    >>> flatten((1, 2))
    [1, 2]
    >>> flatten([1, (2, 3), 4])
    [1, 2, 3, 4]
    >>> flatten([1, (2, [3, 4]), 5])
    [1, 2, 3, 4, 5]
    """
    flat = []
    for element in items:
        if not isinstance(element, (frozenset, list, set, tuple)):
            flat.append(element)
            continue
        # A nested container: splice in its flattened contents
        flat.extend(flatten(element))
    return flat
160
161
def plaintext(text, keeplinebreaks=True):
    """Strip all tags from the text and resolve its entities.

    >>> plaintext('<b>1 &lt; 2</b>')
    u'1 < 2'

    Pass ``keeplinebreaks=False`` to have every newline character replaced by
    a single space:

    >>> plaintext('''<b>1
    ... &lt;
    ... 2</b>''', keeplinebreaks=False)
    u'1 < 2'

    :param text: the text to convert to plain text
    :param keeplinebreaks: whether line breaks in the text should be kept intact
    :return: the text with tags and entities removed
    """
    # Tags are removed first, then the entities in what remains are resolved
    stripped = stripentities(striptags(text))
    if keeplinebreaks:
        return stripped
    return stripped.replace('\n', ' ')
184
185
_STRIPENTITIES_RE = re.compile(r'&(?:#((?:\d+)|(?:[xX][0-9a-fA-F]+));?|(\w+);)')
def stripentities(text, keepxmlentities=False):
    """Return a copy of the given text with any character or numeric entities
    replaced by the equivalent unicode characters.

    >>> stripentities('1 &lt; 2')
    u'1 < 2'
    >>> stripentities('more &hellip;')
    u'more \u2026'
    >>> stripentities('&#8230;')
    u'\u2026'
    >>> stripentities('&#x2026;')
    u'\u2026'

    The hexadecimal marker of a numeric entity may be upper or lower case:

    >>> stripentities('&#X2026;')
    u'\u2026'

    If the `keepxmlentities` parameter is provided and is a truth value, the
    core XML entities (&amp;, &apos;, &gt;, &lt; and &quot;) are left intact.

    >>> stripentities('1 &lt; 2 &hellip;', keepxmlentities=True)
    u'1 &lt; 2 \u2026'

    A named entity that is not known is reduced to its bare name, or, if
    `keepxmlentities` is set, kept with its ampersand escaped as ``&amp;``.

    :param text: the text in which to replace entities
    :param keepxmlentities: whether the core XML entities should be left as is
    :return: the text with the entities replaced
    """
    def _replace_entity(match):
        if match.group(1): # numeric entity
            ref = match.group(1)
            # The pattern accepts both `x` and `X` as the hexadecimal marker,
            # so both need to be recognized here
            if ref[0] in 'xX':
                ref = int(ref[1:], 16)
            else:
                ref = int(ref, 10)
            return unichr(ref)
        else: # character entity
            ref = match.group(2)
            if keepxmlentities and ref in ('amp', 'apos', 'gt', 'lt', 'quot'):
                return '&%s;' % ref
            try:
                return unichr(entities.name2codepoint[ref])
            except KeyError:
                # Unknown entity name
                if keepxmlentities:
                    return '&amp;%s;' % ref
                else:
                    return ref
    return _STRIPENTITIES_RE.sub(_replace_entity, text)
226
227
# DOTALL lets the comment alternative span line breaks; without it, a comment
# covering several lines would fall through to the tag alternative and be cut
# off at its first `>`
_STRIPTAGS_RE = re.compile(r'(<!--.*?-->|<[^>]*>)', re.DOTALL)
def striptags(text):
    """Return a copy of the text with any XML/HTML tags removed.

    >>> striptags('<span>Foo</span> bar')
    'Foo bar'
    >>> striptags('<span class="bar">Foo</span>')
    'Foo'
    >>> striptags('Foo<br />')
    'Foo'

    HTML/XML comments are stripped, too:

    >>> striptags('<!-- <blub>hehe</blah> -->test')
    'test'

    This includes comments that extend over more than one line:

    >>> striptags('''<!-- <blub>
    ... hehe</blah> -->test''')
    'test'

    :param text: the string to remove tags from
    :return: the text with tags removed
    """
    return _STRIPTAGS_RE.sub('', text)
248
249
def stringrepr(string):
    """Return a quoted, ASCII-only representation of the given string.

    Characters outside the ASCII range are written as backslash escapes, and
    single quotes are escaped with a backslash. Backslashes already present in
    the string are not escaped. The result is prefixed with ``u`` if any
    character had to be replaced by an escape sequence.

    >>> print(stringrepr(u'foo'))
    'foo'
    >>> print(stringrepr(u"it's"))
    'it\\'s'
    >>> print(stringrepr(u'caf\\xe9'))
    u'caf\\xe9'

    :param string: the string to represent
    :return: the representation, wrapped in single quotes
    """
    encoded = string.encode('ascii', 'backslashreplace')
    if not isinstance(encoded, str):
        # Where `encode()` yields a separate bytes type, turn the result back
        # into text so that it can be combined with the quote characters
        encoded = encoded.decode('ascii')
    quoted = "'" + encoded.replace("'", "\\'") + "'"
    # Escape sequences make the encoded form longer than the original
    if len(encoded) > len(string):
        return 'u' + quoted
    return quoted
256
257
258# Compatibility fallback implementations for older Python versions
259
try:
    # Re-export the built-ins under this module's namespace where they exist
    any, all = any, all
except NameError:
    def all(S):
        """Return `True` unless some element of `S` is false."""
        for element in S:
            if element:
                continue
            return False
        return True

    def any(S):
        """Return `True` if at least one element of `S` is true."""
        for element in S:
            if not element:
                continue
            return True
        return False
Note: See TracBrowser for help on using the repository browser.