Changeset 185 for trunk/markup/input.py
Timestamp: Aug 11, 2006, 6:34:35 PM (17 years ago)
File: 1 edited
 - trunk/markup/input.py (modified) (8 diffs)

Legend:
 - unmodified lines carry no prefix
 - added lines are prefixed with "+"
 - removed lines are prefixed with "-"
trunk/markup/input.py
--- trunk/markup/input.py (r184)
+++ trunk/markup/input.py (r185)
@@ -12,4 +12,5 @@
 # history and logs, available at http://markup.edgewall.org/log/.
 
+from itertools import chain
 from xml.parsers import expat
 try:
@@ -22,4 +23,8 @@
 
 from markup.core import Attributes, Markup, QName, Stream
+from markup.core import DOCTYPE, START, END, START_NS, END_NS, TEXT, \
+    START_CDATA, END_CDATA, PI, COMMENT
+
+__all__ = ['ParseError', 'XMLParser', 'XML', 'HTMLParser', 'HTML']
 
 
@@ -83,33 +88,38 @@
         self._queue = []
 
+    def parse(self):
+        def _generate():
+            try:
+                bufsize = 4 * 1024 # 4K
+                done = False
+                while 1:
+                    while not done and len(self._queue) == 0:
+                        data = self.source.read(bufsize)
+                        if data == '': # end of data
+                            if hasattr(self, 'expat'):
+                                self.expat.Parse('', True)
+                                del self.expat # get rid of circular references
+                            done = True
+                        else:
+                            self.expat.Parse(data, False)
+                    for event in self._queue:
+                        yield event
+                    self._queue = []
+                    if done:
+                        break
+            except expat.ExpatError, e:
+                msg = str(e)
+                if self.filename:
+                    msg += ', in ' + self.filename
+                raise ParseError(msg, self.filename, e.lineno, e.offset)
+        return Stream(_generate()).filter(CoalesceFilter())
+
     def __iter__(self):
-        try:
-            bufsize = 4 * 1024 # 4K
-            done = False
-            while 1:
-                while not done and len(self._queue) == 0:
-                    data = self.source.read(bufsize)
-                    if data == '': # end of data
-                        if hasattr(self, 'expat'):
-                            self.expat.Parse('', True)
-                            del self.expat # get rid of circular references
-                        done = True
-                    else:
-                        self.expat.Parse(data, False)
-                for event in self._queue:
-                    yield event
-                self._queue = []
-                if done:
-                    break
-        except expat.ExpatError, e:
-            msg = str(e)
-            if self.filename:
-                msg += ', in ' + self.filename
-            raise ParseError(msg, self.filename, e.lineno, e.offset)
+        return iter(self.parse())
 
     def _enqueue(self, kind, data=None, pos=None):
         if pos is None:
             pos = self._getpos()
-        if kind is Stream.TEXT:
+        if kind is TEXT:
             # Expat reports the *end* of the text event as current position. We
             # try to fix that up here as much as possible. Unfortunately, the
@@ -135,32 +145,32 @@
 
     def _handle_start(self, tag, attrib):
-        self._enqueue(Stream.START, (QName(tag), Attributes(attrib.items())))
+        self._enqueue(START, (QName(tag), Attributes(attrib.items())))
 
     def _handle_end(self, tag):
-        self._enqueue(Stream.END, QName(tag))
+        self._enqueue(END, QName(tag))
 
     def _handle_data(self, text):
-        self._enqueue(Stream.TEXT, text)
+        self._enqueue(TEXT, text)
 
     def _handle_doctype(self, name, sysid, pubid, has_internal_subset):
-        self._enqueue(Stream.DOCTYPE, (name, pubid, sysid))
+        self._enqueue(DOCTYPE, (name, pubid, sysid))
 
     def _handle_start_ns(self, prefix, uri):
-        self._enqueue(Stream.START_NS, (prefix or '', uri))
+        self._enqueue(START_NS, (prefix or '', uri))
 
     def _handle_end_ns(self, prefix):
-        self._enqueue(Stream.END_NS, prefix or '')
+        self._enqueue(END_NS, prefix or '')
 
     def _handle_start_cdata(self):
-        self._enqueue(Stream.START_CDATA)
+        self._enqueue(START_CDATA)
 
     def _handle_end_cdata(self):
-        self._enqueue(Stream.END_CDATA)
+        self._enqueue(END_CDATA)
 
     def _handle_pi(self, target, data):
-        self._enqueue(Stream.PI, (target, data))
+        self._enqueue(PI, (target, data))
 
     def _handle_comment(self, text):
-        self._enqueue(Stream.COMMENT, text)
+        self._enqueue(COMMENT, text)
 
     def _handle_other(self, text):
@@ -169,5 +179,5 @@
             try:
                 text = unichr(htmlentitydefs.name2codepoint[text[1:-1]])
-                self._enqueue(Stream.TEXT, text)
+                self._enqueue(TEXT, text)
             except KeyError:
                 lineno, offset = self._getpos()
@@ -209,30 +219,35 @@
         self._open_tags = []
 
+    def parse(self):
+        def _generate():
+            try:
+                bufsize = 4 * 1024 # 4K
+                done = False
+                while 1:
+                    while not done and len(self._queue) == 0:
+                        data = self.source.read(bufsize)
+                        if data == '': # end of data
+                            self.close()
+                            done = True
+                        else:
+                            self.feed(data)
+                    for kind, data, pos in self._queue:
+                        yield kind, data, pos
+                    self._queue = []
+                    if done:
+                        open_tags = self._open_tags
+                        open_tags.reverse()
+                        for tag in open_tags:
+                            yield END, QName(tag), pos
+                        break
+            except html.HTMLParseError, e:
+                msg = '%s: line %d, column %d' % (e.msg, e.lineno, e.offset)
+                if self.filename:
+                    msg += ', in %s' % self.filename
+                raise ParseError(msg, self.filename, e.lineno, e.offset)
+        return Stream(_generate()).filter(CoalesceFilter())
+
     def __iter__(self):
-        try:
-            bufsize = 4 * 1024 # 4K
-            done = False
-            while 1:
-                while not done and len(self._queue) == 0:
-                    data = self.source.read(bufsize)
-                    if data == '': # end of data
-                        self.close()
-                        done = True
-                    else:
-                        self.feed(data)
-                for kind, data, pos in self._queue:
-                    yield kind, data, pos
-                self._queue = []
-                if done:
-                    open_tags = self._open_tags
-                    open_tags.reverse()
-                    for tag in open_tags:
-                        yield Stream.END, QName(tag), pos
-                    break
-        except html.HTMLParseError, e:
-            msg = '%s: line %d, column %d' % (e.msg, e.lineno, e.offset)
-            if self.filename:
-                msg += ', in %s' % self.filename
-            raise ParseError(msg, self.filename, e.lineno, e.offset)
+        return iter(self.parse())
 
     def _enqueue(self, kind, data, pos=None):
@@ -252,7 +267,7 @@
             fixed_attrib.append((name, unicode(value)))
 
-        self._enqueue(Stream.START, (QName(tag), Attributes(fixed_attrib)))
+        self._enqueue(START, (QName(tag), Attributes(fixed_attrib)))
         if tag in self._EMPTY_ELEMS:
-            self._enqueue(Stream.END, QName(tag))
+            self._enqueue(END, QName(tag))
         else:
            self._open_tags.append(tag)
@@ -264,25 +279,50 @@
                 if open_tag.lower() == tag.lower():
                     break
-                self._enqueue(Stream.END, QName(open_tag))
-            self._enqueue(Stream.END, QName(tag))
+                self._enqueue(END, QName(open_tag))
+            self._enqueue(END, QName(tag))
 
     def handle_data(self, text):
-        self._enqueue(Stream.TEXT, text)
+        self._enqueue(TEXT, text)
 
     def handle_charref(self, name):
-        self._enqueue(Stream.TEXT, Markup('&#%s;' % name))
+        text = unichr(int(name))
+        self._enqueue(TEXT, text)
 
     def handle_entityref(self, name):
-        self._enqueue(Stream.TEXT, Markup('&%s;' % name))
+        try:
+            text = unichr(htmlentitydefs.name2codepoint[name])
+        except KeyError:
+            text = '&%s;' % name
+        self._enqueue(TEXT, text)
 
     def handle_pi(self, data):
         target, data = data.split(maxsplit=1)
         data = data.rstrip('?')
-        self._enqueue(Stream.PI, (target.strip(), data.strip()))
+        self._enqueue(PI, (target.strip(), data.strip()))
 
     def handle_comment(self, text):
-        self._enqueue(Stream.COMMENT, text)
+        self._enqueue(COMMENT, text)
 
 
 def HTML(text):
     return Stream(list(HTMLParser(StringIO(text))))
+
+
+class CoalesceFilter(object):
+    """Coalesces adjacent TEXT events into a single event."""
+
+    def __call__(self, stream, ctxt=None):
+        textbuf = []
+        textpos = None
+        for kind, data, pos in chain(stream, [(None, None, None)]):
+            if kind is TEXT:
+                textbuf.append(data)
+                if textpos is None:
+                    textpos = pos
+            else:
+                if textbuf:
+                    yield TEXT, u''.join(textbuf), textpos
+                    del textbuf[:]
+                    textpos = None
+                if kind:
+                    yield kind, data, pos
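In practice, this changeset gives both XMLParser and HTMLParser a parse() method whose result is a Stream run through the new CoalesceFilter, and it makes the HTML handlers decode character and entity references into plain text instead of emitting Markup events, so adjacent TEXT events are merged into one. A minimal sketch of how that behaviour could be exercised (not part of the changeset; it assumes markup.input and markup.core are importable as laid out in the diff above, and uses Python 2 syntax to match the code):

    from StringIO import StringIO

    from markup.core import TEXT
    from markup.input import HTMLParser

    # Before r185, '&amp;' surfaced as its own Markup('&amp;') event; now it is
    # decoded to u'&' and coalesced with the surrounding character data, so the
    # paragraph content should come through as a single TEXT event.
    events = list(HTMLParser(StringIO('<p>a &amp; b</p>')))
    text_events = [data for kind, data, pos in events if kind is TEXT]
    print text_events   # expected, roughly: [u'a & b']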
