Edgewall Software

source: trunk/genshi/tests/input.py

Last change on this file was 1219, checked in by hodgestar, 11 years ago

Fix handling of processing instructions that don't have data attached (patch from Neil Muller, fixes #368).

  • Property svn:eol-style set to native
File size: 11.3 KB
Line 
1# -*- coding: utf-8 -*-
2#
3# Copyright (C) 2006-2009 Edgewall Software
4# All rights reserved.
5#
6# This software is licensed as described in the file COPYING, which
7# you should have received as part of this distribution. The terms
8# are also available at http://genshi.edgewall.org/wiki/License.
9#
10# This software consists of voluntary contributions made by many
11# individuals. For the exact contribution history, see the revision
12# history and logs, available at http://genshi.edgewall.org/log/.
13
14import doctest
15import sys
16import unittest
17
18from genshi.core import Attrs, Stream
19from genshi.input import XMLParser, HTMLParser, ParseError
20from genshi.compat import StringIO, BytesIO
21
22
class XMLParserTestCase(unittest.TestCase):
    """Feeds small documents to ``XMLParser`` and checks the events."""

    @staticmethod
    def _parse(source, **options):
        # Drain the parser up front so individual events can be indexed.
        return list(XMLParser(source, **options))

    def test_text_node_pos_single_line(self):
        # The event at index 1 is the text between the two tags.
        markup = '<elem>foo bar</elem>'
        kind, data, pos = self._parse(StringIO(markup))[1]
        self.assertEqual(Stream.TEXT, kind)
        self.assertEqual('foo bar', data)
        self.assertEqual((None, 1, 6), pos)

    def test_text_node_pos_multi_line(self):
        # Same document as above, but the text contains a line break.
        markup = '''<elem>foo
bar</elem>'''
        kind, data, pos = self._parse(StringIO(markup))[1]
        self.assertEqual(Stream.TEXT, kind)
        self.assertEqual('foo\nbar', data)
        self.assertEqual((None, 1, -1), pos)

    def test_element_attribute_order(self):
        # Attributes must come back in the order they appear in the source.
        markup = '<elem title="baz" id="foo" class="bar" />'
        kind, data, _ = self._parse(StringIO(markup))[0]
        self.assertEqual(Stream.START, kind)
        tag, attrs = data
        self.assertEqual('elem', tag)
        in_source_order = [('title', 'baz'), ('id', 'foo'), ('class', 'bar')]
        for index, pair in enumerate(in_source_order):
            self.assertEqual(pair, attrs[index])

    def test_unicode_input(self):
        # A non-ASCII character in text input comes through unchanged.
        markup = u'<div>\u2013</div>'
        kind, data, _ = self._parse(StringIO(markup))[1]
        self.assertEqual(Stream.TEXT, kind)
        self.assertEqual(u'\u2013', data)

    def test_latin1_encoded(self):
        # Byte input, with the encoding handed to the parser explicitly.
        raw = u'<div>\xf6</div>'.encode('iso-8859-1')
        found = self._parse(BytesIO(raw), encoding='iso-8859-1')
        kind, data, _ = found[1]
        self.assertEqual(Stream.TEXT, kind)
        self.assertEqual(u'\xf6', data)

    def test_latin1_encoded_xmldecl(self):
        # No encoding argument here: the document names its own encoding in
        # the XML declaration.
        raw = u"""<?xml version="1.0" encoding="iso-8859-1" ?>
        <div>\xf6</div>
        """.encode('iso-8859-1')
        kind, data, _ = self._parse(BytesIO(raw))[2]
        self.assertEqual(Stream.TEXT, kind)
        self.assertEqual(u'\xf6', data)

    def test_html_entity_with_dtd(self):
        # &nbsp; must resolve to U+00A0 when an XHTML doctype is present.
        markup = """<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
        "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
        <html>&nbsp;</html>
        """
        kind, data, _ = self._parse(StringIO(markup))[2]
        self.assertEqual(Stream.TEXT, kind)
        self.assertEqual(u'\xa0', data)

    def test_html_entity_without_dtd(self):
        # ...and also when the document has no doctype at all.
        markup = '<html>&nbsp;</html>'
        kind, data, _ = self._parse(StringIO(markup))[1]
        self.assertEqual(Stream.TEXT, kind)
        self.assertEqual(u'\xa0', data)

    def test_html_entity_in_attribute(self):
        # The entity is resolved inside an attribute value as well; the
        # self-closing element still yields a START followed by an END.
        markup = '<p title="&nbsp;"/>'
        found = self._parse(StringIO(markup))
        kind, data, _ = found[0]
        self.assertEqual(Stream.START, kind)
        self.assertEqual(u'\xa0', data[1].get('title'))
        kind, data, _ = found[1]
        self.assertEqual(Stream.END, kind)

    def test_undefined_entity_with_dtd(self):
        # The parser is built first; the error is expected only once the
        # stream is actually consumed by list().
        markup = """<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
        "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
        <html>&junk;</html>
        """
        parser = XMLParser(StringIO(markup))
        self.assertRaises(ParseError, list, parser)

    def test_undefined_entity_without_dtd(self):
        # Same expectation without a doctype.
        markup = '<html>&junk;</html>'
        parser = XMLParser(StringIO(markup))
        self.assertRaises(ParseError, list, parser)
114
115
class HTMLParserTestCase(unittest.TestCase):
    """Feeds small documents to ``HTMLParser`` and checks the events.

    The three ``test_xmldecl*`` methods in this class drive ``XMLParser``
    rather than ``HTMLParser``.
    """

    @staticmethod
    def _parse(source, **options):
        # Drain the parser up front so individual events can be indexed.
        return list(HTMLParser(source, **options))

    def test_text_node_pos_single_line(self):
        # The event at index 1 is the text between the two tags.
        markup = u'<elem>foo bar</elem>'
        kind, data, pos = self._parse(StringIO(markup))[1]
        self.assertEqual(Stream.TEXT, kind)
        self.assertEqual('foo bar', data)
        self.assertEqual((None, 1, 6), pos)

    def test_text_node_pos_multi_line(self):
        # Same document, but the text contains a line break.
        markup = u'''<elem>foo
bar</elem>'''
        kind, data, pos = self._parse(StringIO(markup))[1]
        self.assertEqual(Stream.TEXT, kind)
        self.assertEqual('foo\nbar', data)
        self.assertEqual((None, 1, 6), pos)

    def test_input_encoding_text(self):
        # Byte input with an explicit encoding; checks a text node.
        raw = u'<div>\xf6</div>'.encode('iso-8859-1')
        found = self._parse(BytesIO(raw), encoding='iso-8859-1')
        kind, data, _ = found[1]
        self.assertEqual(Stream.TEXT, kind)
        self.assertEqual(u'\xf6', data)

    def test_input_encoding_attribute(self):
        # Byte input with an explicit encoding; checks an attribute value.
        raw = u'<div title="\xf6"></div>'.encode('iso-8859-1')
        found = self._parse(BytesIO(raw), encoding='iso-8859-1')
        kind, (tag, attrs), _ = found[0]
        self.assertEqual(Stream.START, kind)
        self.assertEqual(u'\xf6', attrs.get('title'))

    def test_unicode_input(self):
        # A non-ASCII character in text input comes through unchanged.
        markup = u'<div>\u2013</div>'
        kind, data, _ = self._parse(StringIO(markup))[1]
        self.assertEqual(Stream.TEXT, kind)
        self.assertEqual(u'\u2013', data)

    def test_html_entity_in_attribute(self):
        # &nbsp; inside an attribute value must resolve to U+00A0.
        markup = u'<p title="&nbsp;"></p>'
        found = self._parse(StringIO(markup))
        kind, data, _ = found[0]
        self.assertEqual(Stream.START, kind)
        self.assertEqual(u'\xa0', data[1].get('title'))
        kind, data, _ = found[1]
        self.assertEqual(Stream.END, kind)

    def test_html_entity_in_text(self):
        # &nbsp; inside element text must resolve to U+00A0.
        markup = u'<p>&nbsp;</p>'
        kind, data, _ = self._parse(StringIO(markup))[1]
        self.assertEqual(Stream.TEXT, kind)
        self.assertEqual(u'\xa0', data)

    def test_processing_instruction(self):
        # A PI event carries a (target, data) pair.
        markup = u'<?php echo "Foobar" ?>'
        kind, (target, data), _ = self._parse(StringIO(markup))[0]
        self.assertEqual(Stream.PI, kind)
        self.assertEqual('php', target)
        self.assertEqual('echo "Foobar"', data)

    def test_processing_instruction_no_data_1(self):
        # A PI that has a target but nothing after it yields empty data.
        markup = u'<?foo ?>'
        kind, (target, data), _ = self._parse(StringIO(markup))[0]
        self.assertEqual(Stream.PI, kind)
        self.assertEqual('foo', target)
        self.assertEqual('', data)

    def test_processing_instruction_no_data_2(self):
        # Two data-less PIs closed by a bare '>', with text in between.
        markup = u'<?experiment>...<?/experiment>'
        found = self._parse(StringIO(markup))
        kind, (target, data), _ = found[0]
        self.assertEqual(Stream.PI, kind)
        self.assertEqual('experiment', target)
        self.assertEqual('', data)
        # Index 2 is the second PI; only its target and data are checked.
        kind, (target, data), _ = found[2]
        self.assertEqual('/experiment', target)
        self.assertEqual('', data)

    def test_xmldecl(self):
        # NOTE: uses XMLParser, not HTMLParser.
        markup = '<?xml version="1.0" ?><root />'
        first = list(XMLParser(StringIO(markup)))[0]
        kind, (version, encoding, standalone), _ = first
        self.assertEqual(Stream.XML_DECL, kind)
        self.assertEqual('1.0', version)
        self.assertEqual(None, encoding)
        self.assertEqual(-1, standalone)

    def test_xmldecl_encoding(self):
        # NOTE: uses XMLParser, not HTMLParser.
        markup = '<?xml version="1.0" encoding="utf-8" ?><root />'
        first = list(XMLParser(StringIO(markup)))[0]
        kind, (version, encoding, standalone), _ = first
        self.assertEqual(Stream.XML_DECL, kind)
        self.assertEqual('1.0', version)
        self.assertEqual('utf-8', encoding)
        self.assertEqual(-1, standalone)

    def test_xmldecl_standalone(self):
        # NOTE: uses XMLParser, not HTMLParser.
        markup = '<?xml version="1.0" standalone="yes" ?><root />'
        first = list(XMLParser(StringIO(markup)))[0]
        kind, (version, encoding, standalone), _ = first
        self.assertEqual(Stream.XML_DECL, kind)
        self.assertEqual('1.0', version)
        self.assertEqual(None, encoding)
        self.assertEqual(1, standalone)

    def test_processing_instruction_trailing_qmark(self):
        # Only the final '?' closes the PI; the one before it is data.
        markup = u'<?php echo "Foobar" ??>'
        kind, (target, data), _ = self._parse(StringIO(markup))[0]
        self.assertEqual(Stream.PI, kind)
        self.assertEqual('php', target)
        self.assertEqual('echo "Foobar" ?', data)

    def test_out_of_order_tags1(self):
        # The closing tags are swapped in the input; the END events are
        # expected in properly nested order.
        markup = u'<span><b>Foobar</span></b>'
        found = self._parse(StringIO(markup))
        wanted = [
            (Stream.START, ('span', ())),
            (Stream.START, ('b', ())),
            (Stream.TEXT, 'Foobar'),
            (Stream.END, 'b'),
            (Stream.END, 'span'),
        ]
        self.assertEqual(5, len(found))
        for expected, event in zip(wanted, found):
            self.assertEqual(expected, event[:2])

    def test_out_of_order_tags2(self):
        # Three levels of mis-ordered closing tags, fed as UTF-8 bytes.
        raw = u'<span class="baz"><b><i>Foobar</span></b></i>'.encode('utf-8')
        found = self._parse(BytesIO(raw), encoding='utf-8')
        wanted = [
            (Stream.START, ('span', Attrs([('class', 'baz')]))),
            (Stream.START, ('b', ())),
            (Stream.START, ('i', ())),
            (Stream.TEXT, 'Foobar'),
            (Stream.END, 'i'),
            (Stream.END, 'b'),
            (Stream.END, 'span'),
        ]
        self.assertEqual(7, len(found))
        for expected, event in zip(wanted, found):
            self.assertEqual(expected, event[:2])

    def test_out_of_order_tags3(self):
        # The only closing tag, </i>, matches nothing that was opened; END
        # events for <b> and <span> are still expected.
        raw = u'<span><b>Foobar</i>'.encode('utf-8')
        found = self._parse(BytesIO(raw), encoding='utf-8')
        wanted = [
            (Stream.START, ('span', ())),
            (Stream.START, ('b', ())),
            (Stream.TEXT, 'Foobar'),
            (Stream.END, 'b'),
            (Stream.END, 'span'),
        ]
        self.assertEqual(5, len(found))
        for expected, event in zip(wanted, found):
            self.assertEqual(expected, event[:2])

    def test_hex_charref(self):
        # &#x27; is the hexadecimal character reference for an apostrophe.
        markup = u'<span>&#x27;</span>'
        found = self._parse(StringIO(markup))
        wanted = [
            (Stream.START, ('span', ())),
            (Stream.TEXT, "'"),
            (Stream.END, 'span'),
        ]
        self.assertEqual(3, len(found))
        for expected, event in zip(wanted, found):
            self.assertEqual(expected, event[:2])

    def test_multibyte_character_on_chunk_boundary(self):
        # 4095 ASCII characters followed by U+00E6, which takes two bytes in
        # UTF-8. Going by the test name, this presumably places the character
        # across the parser's read-chunk boundary -- confirm against
        # HTMLParser's buffer size.
        payload = u'a' * (4 * 1024 - 1) + u'\xe6'
        raw = payload.encode('utf-8')
        found = self._parse(BytesIO(raw), encoding='utf-8')
        self.assertEqual(1, len(found))
        self.assertEqual((Stream.TEXT, payload), found[0][:2])
281
282
def suite():
    """Assemble every test in this module into one ``unittest.TestSuite``.

    The suite contains, in this order: the doctests of the module that
    defines ``XMLParser``, all ``test*`` methods of ``XMLParserTestCase``,
    and all ``test*`` methods of ``HTMLParserTestCase``.

    :return: the combined ``unittest.TestSuite``
    """
    # unittest.makeSuite() was deprecated in Python 3.11 and removed in
    # 3.13. TestLoader.loadTestsFromTestCase() collects the same methods
    # (default prefix 'test') and is available on old interpreters too.
    loader = unittest.TestLoader()
    tests = unittest.TestSuite()
    tests.addTest(doctest.DocTestSuite(XMLParser.__module__))
    for case in (XMLParserTestCase, HTMLParserTestCase):
        tests.addTest(loader.loadTestsFromTestCase(case))
    return tests
289
# When this file is executed directly, hand control to unittest and have it
# run the module-level suite() callable as the default test.
if __name__ == '__main__':
    unittest.main(defaultTest='suite')
Note: See TracBrowser for help on using the repository browser.