products/CPSRSS: feedparser.py @ 289:4c69e58c3c6a

changeset:  hgbundler made release tag
author:     Georges Racinet on purity.racinet.fr <georges@racinet.fr>
date:       Wed, 23 Nov 2011 19:59:09 +0100
parents:    541c41ec7e39
1 #!/usr/bin/env python
2 """Universal feed parser
4 Handles RSS 0.9x, RSS 1.0, RSS 2.0, CDF, Atom feeds
6 Visit http://feedparser.org/ for the latest version
7 Visit http://feedparser.org/docs/ for the latest documentation
9 Required: Python 2.1 or later
10 Recommended: Python 2.3 or later
11 Recommended: libxml2 <http://xmlsoft.org/python.html>
12 """
14 #__version__ = "3.2-" + "$Revision$"[11:15] + "-cvs"
15 __version__ = "3.2"
16 __author__ = "Mark Pilgrim <http://diveintomark.org/>"
17 __copyright__ = "Copyright 2002-4, Mark Pilgrim"
18 __contributors__ = ["Jason Diamond <http://injektilo.org/>",
19 "John Beimler <http://john.beimler.org/>",
20 "Fazal Majid <http://www.majid.info/mylos/weblog/>",
21 "Aaron Swartz <http://aaronsw.com>"]
22 __license__ = "Python"
23 _debug = 0
25 # HTTP "User-Agent" header to send to servers when downloading feeds.
26 # If you are embedding feedparser in a larger application, you should
27 # change this to your application name and URL.
28 USER_AGENT = "UniversalFeedParser/%s%s +http://feedparser.org/" % (__version__, _debug and "-debug" or "")
30 # HTTP "Accept" header to send to servers when downloading feeds. If you don't
31 # want to send an Accept header, set this to None.
32 ACCEPT_HEADER = "application/atom+xml,application/rdf+xml,application/rss+xml,application/x-netcdf,application/xml;q=0.9,text/xml;q=0.2,*/*;q=0.1"
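# [Editorial illustration, not part of the original feedparser source.]
# Both headers above are read each time a feed is fetched (see _open_resource
# below), so an embedding application can simply reassign them after import.
# The application name and URL are placeholders.
def _example_override_http_headers():
    import feedparser
    feedparser.USER_AGENT = "MyAggregator/1.0 +http://aggregator.example.com/"
    # send a narrower Accept header, or set it to None to omit it entirely
    feedparser.ACCEPT_HEADER = "application/atom+xml,application/rss+xml;q=0.9,*/*;q=0.1"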
34 # List of preferred XML parsers, by SAX driver name. These will be tried first,
35 # but if they're not installed, Python will keep searching through its own list
36 # of pre-installed parsers until it finds one that supports everything we need.
37 PREFERRED_XML_PARSERS = ["drv_libxml2"]
39 # If you want feedparser to automatically run HTML markup through HTML Tidy, set
40 # this to 1. This is off by default because of reports of crashing on some
41 # platforms. If it crashes for you, please submit a bug report with your OS
42 # platform, Python version, and the URL of the feed you were attempting to parse.
43 # Requires mxTidy <http://www.egenix.com/files/python/mxTidy.html>
44 TIDY_MARKUP = 0
46 # ---------- required modules (should come with any Python distribution) ----------
47 import sgmllib, re, sys, copy, urlparse, time, rfc822, types
48 try:
49 from cStringIO import StringIO as _StringIO
50 except:
51 from StringIO import StringIO as _StringIO
53 # ---------- optional modules (feedparser will work without these, but with reduced functionality) ----------
55 # gzip is included with most Python distributions, but may not be available if you compiled your own
56 try:
57 import gzip
58 except:
59 gzip = None
60 try:
61 import zlib
62 except:
63 zlib = None
65 import socket
66 if hasattr(socket, 'setdefaulttimeout'):
67 socket.setdefaulttimeout(20)
69 import urllib, urllib2
71 _mxtidy = None
72 if TIDY_MARKUP:
73 try:
74 from mx.Tidy import Tidy as _mxtidy
75 except:
76 pass
78 # If a real XML parser is available, feedparser will attempt to use it. feedparser has
79 # been tested with the built-in SAX parser, PyXML, and libxml2. On platforms where the
80 # Python distribution does not come with an XML parser (such as Mac OS X 10.2 and some
81 # versions of FreeBSD), feedparser will quietly fall back on regex-based parsing.
82 try:
83 import xml.sax
84 xml.sax.make_parser(PREFERRED_XML_PARSERS) # test for valid parsers
85 from xml.sax.saxutils import escape as _xmlescape
86 _XML_AVAILABLE = 1
87 except:
88 _XML_AVAILABLE = 0
89 def _xmlescape(data):
90 data = data.replace("&", "&amp;")
91 data = data.replace(">", "&gt;")
92 data = data.replace("<", "&lt;")
93 return data
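# [Editorial illustration, not part of the original feedparser source.]
# Whichever branch ran above, _xmlescape() escapes the three XML-significant
# characters, and _XML_AVAILABLE records whether a usable SAX parser was found.
def _example_xmlescape():
    assert _xmlescape("a < b & c > d") == "a &lt; b &amp; c &gt; d"
    return _XML_AVAILABLE   # 1 if strict XML parsing is possible, else 0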
95 # base64 support for Atom feeds that contain embedded binary data
96 try:
97 import base64, binascii
98 except:
99 base64 = binascii = None
101 # cjkcodecs and iconv_codec provide support for more character encodings.
102 # Both are available from http://cjkpython.i18n.org/
103 try:
104 import cjkcodecs.aliases
105 except:
106 pass
107 try:
108 import iconv_codec
109 except:
110 pass
112 # ---------- don't touch these ----------
113 class CharacterEncodingOverride(Exception): pass
114 class CharacterEncodingUnknown(Exception): pass
116 sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
117 sgmllib.special = re.compile('<!')
118 sgmllib.charref = re.compile('&#(x?[0-9A-Fa-f]+)[^0-9A-Fa-f]')
120 SUPPORTED_VERSIONS = {'': 'unknown',
121 'rss090': 'RSS 0.90',
122 'rss091n': 'RSS 0.91 (Netscape)',
123 'rss091u': 'RSS 0.91 (Userland)',
124 'rss092': 'RSS 0.92',
125 'rss093': 'RSS 0.93',
126 'rss094': 'RSS 0.94',
127 'rss20': 'RSS 2.0',
128 'rss10': 'RSS 1.0',
129 'rss': 'RSS (unknown version)',
130 'atom01': 'Atom 0.1',
131 'atom02': 'Atom 0.2',
132 'atom03': 'Atom 0.3',
133 'atom': 'Atom (unknown version)',
134 'cdf': 'CDF',
135 'hotrss': 'Hot RSS'
136 }
138 try:
139 dict
140 except NameError:
141 # Python 2.1 does not have a built-in dict() function
142 def dict(aList):
143 rc = {}
144 for k, v in aList:
145 rc[k] = v
146 return rc
148 from UserDict import UserDict
149 class FeedParserDict(UserDict):
150 def __getitem__(self, key):
151 keymap = {'channel': 'feed',
152 'items': 'entries',
153 'guid': 'id',
154 'date': 'modified',
155 'date_parsed': 'modified_parsed'}
156 key = keymap.get(key, key)
157 return UserDict.__getitem__(self, key)
159 def has_key(self, key):
160 return hasattr(self, key) or UserDict.has_key(self, key)
162 def __getattr__(self, key):
163 try:
164 return self.__dict__[key]
165 except KeyError:
166 pass
167 try:
168 return self.__getitem__(key)
169 except:
170 raise AttributeError, "object has no attribute '%s'" % key
172 def __contains__(self, key):
173 return self.has_key(key)
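# [Editorial illustration, not part of the original feedparser source.]
# The keymap above makes the older RSS-flavoured key names and the newer ones
# interchangeable, and __getattr__ allows attribute-style access to the same data.
def _example_feedparserdict_aliases():
    d = FeedParserDict()
    d['feed'] = FeedParserDict()
    d['feed']['title'] = 'Example feed'
    assert d['channel'] is d['feed']          # 'channel' maps to 'feed'
    assert d.channel.title == 'Example feed'  # attribute access works too
    return d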
175 def _ebcdic_to_ascii(str):
176 ebcdic_to_ascii_map = (
177 0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
178 16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
179 128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
180 144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
181 32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
182 38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
183 45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
184 186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
185 195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,201,
186 202,106,107,108,109,110,111,112,113,114,203,204,205,206,207,208,
187 209,126,115,116,117,118,119,120,121,122,210,211,212,213,214,215,
188 216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,
189 123,65,66,67,68,69,70,71,72,73,232,233,234,235,236,237,
190 125,74,75,76,77,78,79,80,81,82,238,239,240,241,242,243,
191 92,159,83,84,85,86,87,88,89,90,244,245,246,247,248,249,
192 48,49,50,51,52,53,54,55,56,57,250,251,252,253,254,255
193 )
194 newstr = []
195 for ix in xrange(len(str)):
196 newstr.append(chr(ebcdic_to_ascii_map[ord(str[ix])]))
197 return "".join(newstr)
199 class _FeedParserMixin:
200 namespaces = {"": "",
201 "http://backend.userland.com/rss": "",
202 "http://blogs.law.harvard.edu/tech/rss": "",
203 "http://purl.org/rss/1.0/": "",
204 "http://my.netscape.com/rdf/simple/0.9/": "",
205 "http://example.com/newformat#": "",
206 "http://example.com/necho": "",
207 "http://purl.org/echo/": "",
208 "uri/of/echo/namespace#": "",
209 "http://purl.org/pie/": "",
210 "http://purl.org/atom/ns#": "",
211 "http://purl.org/rss/1.0/modules/rss091#": "",
213 "http://webns.net/mvcb/": "admin",
214 "http://purl.org/rss/1.0/modules/aggregation/": "ag",
215 "http://purl.org/rss/1.0/modules/annotate/": "annotate",
216 "http://media.tangent.org/rss/1.0/": "audio",
217 "http://backend.userland.com/blogChannelModule": "blogChannel",
218 "http://web.resource.org/cc/": "cc",
219 "http://backend.userland.com/creativeCommonsRssModule": "creativeCommons",
220 "http://purl.org/rss/1.0/modules/company": "co",
221 "http://purl.org/rss/1.0/modules/content/": "content",
222 "http://my.theinfo.org/changed/1.0/rss/": "cp",
223 "http://purl.org/dc/elements/1.1/": "dc",
224 "http://purl.org/dc/terms/": "dcterms",
225 "http://purl.org/rss/1.0/modules/email/": "email",
226 "http://purl.org/rss/1.0/modules/event/": "ev",
227 "http://postneo.com/icbm/": "icbm",
228 "http://purl.org/rss/1.0/modules/image/": "image",
229 "http://xmlns.com/foaf/0.1/": "foaf",
230 "http://freshmeat.net/rss/fm/": "fm",
231 "http://purl.org/rss/1.0/modules/link/": "l",
232 "http://madskills.com/public/xml/rss/module/pingback/": "pingback",
233 "http://prismstandard.org/namespaces/1.2/basic/": "prism",
234 "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
235 "http://www.w3.org/2000/01/rdf-schema#": "rdfs",
236 "http://purl.org/rss/1.0/modules/reference/": "ref",
237 "http://purl.org/rss/1.0/modules/richequiv/": "reqv",
238 "http://purl.org/rss/1.0/modules/search/": "search",
239 "http://purl.org/rss/1.0/modules/slash/": "slash",
240 "http://purl.org/rss/1.0/modules/servicestatus/": "ss",
241 "http://hacks.benhammersley.com/rss/streaming/": "str",
242 "http://purl.org/rss/1.0/modules/subscription/": "sub",
243 "http://purl.org/rss/1.0/modules/syndication/": "sy",
244 "http://purl.org/rss/1.0/modules/taxonomy/": "taxo",
245 "http://purl.org/rss/1.0/modules/threading/": "thr",
246 "http://purl.org/rss/1.0/modules/textinput/": "ti",
247 "http://madskills.com/public/xml/rss/module/trackback/":"trackback",
248 "http://wellformedweb.org/CommentAPI/": "wfw",
249 "http://purl.org/rss/1.0/modules/wiki/": "wiki",
250 "http://schemas.xmlsoap.org/soap/envelope/": "soap",
251 "http://www.w3.org/1999/xhtml": "xhtml",
252 "http://www.w3.org/XML/1998/namespace": "xml"
253 }
255 can_be_relative_uri = ['link', 'id', 'wfw_comment', 'wfw_commentrss', 'docs', 'url', 'comments']
256 can_contain_relative_uris = ['content', 'description', 'title', 'summary', 'info', 'tagline', 'copyright']
257 can_contain_dangerous_markup = ['content', 'description', 'title', 'summary', 'info', 'tagline', 'copyright']
258 html_types = ['text/html', 'application/xhtml+xml']
260 def __init__(self, baseuri=None, encoding='utf-8'):
261 if _debug: sys.stderr.write("initializing FeedParser\n")
262 self.feeddata = FeedParserDict() # feed-level data
263 self.encoding = encoding # character encoding
264 self.entries = [] # list of entry-level data
265 self.version = '' # feed type/version, see SUPPORTED_VERSIONS
267 # the following are used internally to track state;
268 # some of this is kind of out of control and should
269 # probably be refactored into a finite state machine
270 self.infeed = 0
271 self.inentry = 0
272 self.incontent = 0
273 self.intextinput = 0
274 self.inimage = 0
275 self.inauthor = 0
276 self.incontributor = 0
277 self.contentparams = FeedParserDict()
278 self.namespacemap = {}
279 self.elementstack = []
280 self.basestack = []
281 self.langstack = []
282 self.baseuri = baseuri or ''
283 self.lang = None
285 def unknown_starttag(self, tag, attrs):
286 if _debug: sys.stderr.write('start %s with %s\n' % (tag, attrs))
287 # normalize attrs
288 attrs = [(k.lower(), v) for k, v in attrs]
289 attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]
291 # track xml:base and xml:lang
292 attrsD = dict(attrs)
293 baseuri = attrsD.get('xml:base', attrsD.get('base'))
294 if baseuri:
295 self.baseuri = baseuri
296 lang = attrsD.get('xml:lang', attrsD.get('lang'))
297 if lang:
298 if (tag in ('feed', 'rss', 'rdf:RDF')) and (not self.lang):
299 self.feeddata['language'] = lang
300 self.lang = lang
301 self.basestack.append(baseuri)
302 self.langstack.append(lang)
304 # track namespaces
305 for prefix, uri in attrs:
306 if prefix.startswith('xmlns:'):
307 self.trackNamespace(prefix[6:], uri)
308 elif prefix == 'xmlns':
309 self.trackNamespace(None, uri)
311 # track inline content
312 if self.incontent and self.contentparams.get('mode') == 'escaped':
313 # element declared itself as escaped markup, but it isn't really
314 self.contentparams['mode'] = 'xml'
315 if self.incontent and self.contentparams.get('mode') == 'xml':
316 # Note: probably shouldn't simply recreate localname here, but
317 # our namespace handling isn't actually 100% correct in cases where
318 # the feed redefines the default namespace (which is actually
319 # the usual case for inline content, thanks Sam), so here we
320 # cheat and just reconstruct the element based on localname
321 # because that compensates for the bugs in our namespace handling.
322 # This will horribly munge inline content with non-empty qnames,
323 # but nobody actually does that, so I'm not fixing it.
324 tag = tag.split(':')[-1]
325 return self.handle_data("<%s%s>" % (tag, "".join([' %s="%s"' % t for t in attrs])), escape=0)
327 # match namespaces
328 if tag.find(':') <> -1:
329 prefix, suffix = tag.split(':', 1)
330 else:
331 prefix, suffix = '', tag
332 prefix = self.namespacemap.get(prefix, prefix)
333 if prefix:
334 prefix = prefix + '_'
336 # special hack for better tracking of empty textinput/image elements in illformed feeds
337 if (not prefix) and tag not in ('title', 'link', 'description', 'name'):
338 self.intextinput = 0
339 if (not prefix) and tag not in ('title', 'link', 'description', 'url', 'width', 'height'):
340 self.inimage = 0
342 # call special handler (if defined) or default handler
343 methodname = '_start_' + prefix + suffix
344 try:
345 method = getattr(self, methodname)
346 return method(attrsD)
347 except AttributeError:
348 return self.push(prefix + suffix, 1)
350 def unknown_endtag(self, tag):
351 if _debug: sys.stderr.write('end %s\n' % tag)
352 # match namespaces
353 if tag.find(':') <> -1:
354 prefix, suffix = tag.split(':', 1)
355 else:
356 prefix, suffix = '', tag
357 prefix = self.namespacemap.get(prefix, prefix)
358 if prefix:
359 prefix = prefix + '_'
361 # call special handler (if defined) or default handler
362 methodname = '_end_' + prefix + suffix
363 try:
364 method = getattr(self, methodname)
365 method()
366 except AttributeError:
367 self.pop(prefix + suffix)
369 # track inline content
370 if self.incontent and self.contentparams.get('mode') == 'escaped':
371 # element declared itself as escaped markup, but it isn't really
372 self.contentparams['mode'] = 'xml'
373 if self.incontent and self.contentparams.get('mode') == 'xml':
374 tag = tag.split(':')[-1]
375 self.handle_data("</%s>" % tag, escape=0)
377 # track xml:base and xml:lang going out of scope
378 if self.basestack:
379 self.basestack.pop()
380 if self.basestack and self.basestack[-1]:
381 baseuri = self.basestack[-1]
382 self.baseuri = baseuri
383 if self.langstack:
384 lang = self.langstack.pop()
385 if lang:
386 self.lang = lang
388 def handle_charref(self, ref):
389 # called for each character reference, e.g. for "&#160;", ref will be "160"
390 if not self.elementstack: return
391 ref = ref.lower()
392 if ref in ('34', '38', '39', '60', '62', 'x22', 'x26', 'x27', 'x3c', 'x3e'):
393 text = "&#%s;" % ref
394 else:
395 if ref[0] == 'x':
396 c = int(ref[1:], 16)
397 else:
398 c = int(ref)
399 text = unichr(c).encode('utf-8')
400 self.elementstack[-1][2].append(text)
402 def handle_entityref(self, ref):
403 # called for each entity reference, e.g. for "&copy;", ref will be "copy"
404 if not self.elementstack: return
405 if _debug: sys.stderr.write("entering handle_entityref with %s\n" % ref)
406 if ref in ('lt', 'gt', 'quot', 'amp', 'apos'):
407 text = '&%s;' % ref
408 else:
409 # entity resolution graciously donated by Aaron Swartz
410 def name2cp(k):
411 import htmlentitydefs
412 if hasattr(htmlentitydefs, "name2codepoint"): # requires Python 2.3
413 return htmlentitydefs.name2codepoint[k]
414 k = htmlentitydefs.entitydefs[k]
415 if k.startswith("&#") and k.endswith(";"):
416 return int(k[2:-1]) # not in latin-1
417 return ord(k)
418 try: name2cp(ref)
419 except KeyError: text = "&%s;" % ref
420 else: text = unichr(name2cp(ref)).encode('utf-8')
421 self.elementstack[-1][2].append(text)
423 def handle_data(self, text, escape=1):
424 # called for each block of plain text, i.e. outside of any tag and
425 # not containing any character or entity references
426 if not self.elementstack: return
427 if escape and self.contentparams.get('mode') == 'xml':
428 text = _xmlescape(text)
429 self.elementstack[-1][2].append(text)
431 def handle_comment(self, text):
432 # called for each comment, e.g. <!-- insert message here -->
433 pass
435 def handle_pi(self, text):
436 # called for each processing instruction, e.g. <?instruction>
437 pass
439 def handle_decl(self, text):
440 pass
442 def parse_declaration(self, i):
443 # override internal declaration handler to handle CDATA blocks
444 if _debug: sys.stderr.write("entering parse_declaration\n")
445 if self.rawdata[i:i+9] == '<![CDATA[':
446 k = self.rawdata.find(']]>', i)
447 if k == -1: k = len(self.rawdata)
448 self.handle_data(_xmlescape(self.rawdata[i+9:k]), 0)
449 return k+3
450 else:
451 k = self.rawdata.find('>', i)
452 return k+1
454 def trackNamespace(self, prefix, uri):
455 if (prefix, uri) == (None, 'http://my.netscape.com/rdf/simple/0.9/') and not self.version:
456 self.version = 'rss090'
457 if (prefix, uri) == (None, 'http://purl.org/rss/1.0/') and not self.version:
458 self.version = 'rss10'
459 if not prefix: return
460 if uri.find('backend.userland.com/rss') <> -1:
461 # match any backend.userland.com namespace
462 uri = 'http://backend.userland.com/rss'
463 if self.namespaces.has_key(uri):
464 self.namespacemap[prefix] = self.namespaces[uri]
466 def resolveURI(self, uri):
467 return urlparse.urljoin(self.baseuri or '', uri)
469 def decodeEntities(self, element, data):
470 return data
472 def push(self, element, expectingText):
473 self.elementstack.append([element, expectingText, []])
475 def pop(self, element):
476 if not self.elementstack: return
477 if self.elementstack[-1][0] != element: return
479 element, expectingText, pieces = self.elementstack.pop()
480 output = "".join(pieces)
481 output = output.strip()
482 if not expectingText: return output
484 # decode base64 content
485 if self.contentparams.get('mode') == 'base64' and base64:
486 try:
487 output = base64.decodestring(output)
488 except binascii.Error:
489 pass
490 except binascii.Incomplete:
491 pass
493 # resolve relative URIs
494 if (element in self.can_be_relative_uri) and output:
495 output = self.resolveURI(output)
497 # decode entities within embedded markup
498 output = self.decodeEntities(element, output)
500 # resolve relative URIs within embedded markup
501 if self.contentparams.get('type', 'text/html') in self.html_types:
502 if element in self.can_contain_relative_uris:
503 output = _resolveRelativeURIs(output, self.baseuri, self.encoding)
505 # sanitize embedded markup
506 if self.contentparams.get('type', 'text/html') in self.html_types:
507 if element in self.can_contain_dangerous_markup:
508 output = _sanitizeHTML(output, self.encoding)
510 if self.encoding and (type(output) == types.StringType):
511 try:
512 output = unicode(output, self.encoding)
513 except:
514 pass
516 # store output in appropriate place(s)
517 if self.inentry:
518 if element == 'content':
519 self.entries[-1].setdefault(element, [])
520 contentparams = copy.deepcopy(self.contentparams)
521 contentparams['value'] = output
522 self.entries[-1][element].append(contentparams)
523 elif element == 'category':
524 self.entries[-1][element] = output
525 domain = self.entries[-1]['categories'][-1][0]
526 self.entries[-1]['categories'][-1] = (domain, output)
527 elif element == 'source':
528 self.entries[-1]['source']['value'] = output
529 elif element == 'link':
530 self.entries[-1][element] = output
531 if output:
532 self.entries[-1]['links'][-1]['href'] = output
533 else:
534 self.entries[-1][element] = output
535 if self.incontent:
536 if element == 'description':
537 element = 'summary'
538 contentparams = copy.deepcopy(self.contentparams)
539 contentparams['value'] = output
540 self.entries[-1][element + '_detail'] = contentparams
541 elif self.infeed and (not self.intextinput) and (not self.inimage):
542 self.feeddata[element] = output
543 if element == 'category':
544 domain = self.feeddata['categories'][-1][0]
545 self.feeddata['categories'][-1] = (domain, output)
546 elif element == 'link':
547 self.feeddata['links'][-1]['href'] = output
548 elif self.incontent:
549 if element == 'description':
550 element = 'tagline'
551 contentparams = copy.deepcopy(self.contentparams)
552 contentparams['value'] = output
553 self.feeddata[element + '_detail'] = contentparams
554 return output
556 def _mapToStandardPrefix(self, name):
557 colonpos = name.find(':')
558 if colonpos <> -1:
559 prefix = name[:colonpos]
560 suffix = name[colonpos+1:]
561 prefix = self.namespacemap.get(prefix, prefix)
562 name = prefix + ':' + suffix
563 return name
565 def _getAttribute(self, attrsD, name):
566 return attrsD.get(self._mapToStandardPrefix(name))
568 def _save(self, key, value):
569 if value:
570 if self.inentry:
571 self.entries[-1].setdefault(key, value)
572 elif self.feeddata:
573 self.feeddata.setdefault(key, value)
575 def _start_rss(self, attrsD):
576 versionmap = {'0.91': 'rss091u',
577 '0.92': 'rss092',
578 '0.93': 'rss093',
579 '0.94': 'rss094'}
580 if not self.version:
581 attr_version = attrsD.get('version', '')
582 version = versionmap.get(attr_version)
583 if version:
584 self.version = version
585 elif attr_version.startswith('2.'):
586 self.version = 'rss20'
587 else:
588 self.version = 'rss'
590 def _start_dlhottitles(self, attrsD):
591 self.version = 'hotrss'
593 def _start_channel(self, attrsD):
594 self.infeed = 1
595 self._cdf_common(attrsD)
596 _start_feedinfo = _start_channel
598 def _cdf_common(self, attrsD):
599 if attrsD.has_key('lastmod'):
600 self._start_modified({})
601 self.elementstack[-1][-1] = attrsD['lastmod']
602 self._end_modified()
603 if attrsD.has_key('href'):
604 self._start_link({})
605 self.elementstack[-1][-1] = attrsD['href']
606 self._end_link()
608 def _start_feed(self, attrsD):
609 self.infeed = 1
610 versionmap = {'0.1': 'atom01',
611 '0.2': 'atom02',
612 '0.3': 'atom03'}
613 if not self.version:
614 attr_version = attrsD.get('version')
615 version = versionmap.get(attr_version)
616 if version:
617 self.version = version
618 else:
619 self.version = 'atom'
621 def _end_channel(self):
622 self.infeed = 0
623 _end_feed = _end_channel
625 def _start_image(self, attrsD):
626 self.inimage = 1
627 self.push('image', 0)
628 context = self._getContext()
629 context.setdefault('image', FeedParserDict())
631 def _end_image(self):
632 self.pop('image')
633 self.inimage = 0
635 def _start_textinput(self, attrsD):
636 self.intextinput = 1
637 self.push('textinput', 0)
638 context = self._getContext()
639 context.setdefault('textinput', FeedParserDict())
640 _start_textInput = _start_textinput
642 def _end_textinput(self):
643 self.pop('textinput')
644 self.intextinput = 0
645 _end_textInput = _end_textinput
647 def _start_author(self, attrsD):
648 self.inauthor = 1
649 self.push('author', 1)
650 _start_managingeditor = _start_author
651 _start_dc_author = _start_author
652 _start_dc_creator = _start_author
654 def _end_author(self):
655 self.pop('author')
656 self.inauthor = 0
657 self._sync_author_detail()
658 _end_managingeditor = _end_author
659 _end_dc_author = _end_author
660 _end_dc_creator = _end_author
662 def _start_contributor(self, attrsD):
663 self.incontributor = 1
664 context = self._getContext()
665 context.setdefault('contributors', [])
666 context['contributors'].append(FeedParserDict())
667 self.push('contributor', 0)
669 def _end_contributor(self):
670 self.pop('contributor')
671 self.incontributor = 0
673 def _start_name(self, attrsD):
674 self.push('name', 0)
676 def _end_name(self):
677 value = self.pop('name')
678 if self.inauthor:
679 self._save_author('name', value)
680 elif self.incontributor:
681 self._save_contributor('name', value)
682 elif self.intextinput:
683 context = self._getContext()
684 context['textinput']['name'] = value
686 def _start_width(self, attrsD):
687 self.push('width', 0)
689 def _end_width(self):
690 value = self.pop('width')
691 try:
692 value = int(value)
693 except:
694 value = 0
695 if self.inimage:
696 context = self._getContext()
697 context['image']['width'] = value
699 def _start_height(self, attrsD):
700 self.push('height', 0)
702 def _end_height(self):
703 value = self.pop('height')
704 try:
705 value = int(value)
706 except:
707 value = 0
708 if self.inimage:
709 context = self._getContext()
710 context['image']['height'] = value
712 def _start_url(self, attrsD):
713 self.push('url', 1)
714 _start_homepage = _start_url
715 _start_uri = _start_url
717 def _end_url(self):
718 value = self.pop('url')
719 if self.inauthor:
720 self._save_author('url', value)
721 elif self.incontributor:
722 self._save_contributor('url', value)
723 elif self.inimage:
724 context = self._getContext()
725 context['image']['url'] = value
726 elif self.intextinput:
727 context = self._getContext()
728 context['textinput']['link'] = value
729 _end_homepage = _end_url
730 _end_uri = _end_url
732 def _start_email(self, attrsD):
733 self.push('email', 0)
735 def _end_email(self):
736 value = self.pop('email')
737 if self.inauthor:
738 self._save_author('email', value)
739 elif self.incontributor:
740 self._save_contributor('email', value)
741 pass
743 def _getContext(self):
744 if self.inentry:
745 context = self.entries[-1]
746 else:
747 context = self.feeddata
748 return context
750 def _save_author(self, key, value):
751 context = self._getContext()
752 context.setdefault('author_detail', FeedParserDict())
753 context['author_detail'][key] = value
754 self._sync_author_detail()
756 def _save_contributor(self, key, value):
757 context = self._getContext()
758 context.setdefault('contributors', [FeedParserDict()])
759 context['contributors'][-1][key] = value
761 def _sync_author_detail(self, key='author'):
762 context = self._getContext()
763 detail = context.get('%s_detail' % key)
764 if detail:
765 name = detail.get('name')
766 email = detail.get('email')
767 if name and email:
768 context[key] = "%s (%s)" % (name, email)
769 elif name:
770 context[key] = name
771 elif email:
772 context[key] = email
773 else:
774 author = context.get(key)
775 if not author: return
776 emailmatch = re.search(r"""(([a-zA-Z0-9\_\-\.\+]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?))""", author)
777 if not emailmatch: return
778 email = emailmatch.group(0)
779 # probably a better way to do the following, but it passes all the tests
780 author = author.replace(email, '')
781 author = author.replace('()', '')
782 author = author.strip()
783 if author and (author[0] == '('):
784 author = author[1:]
785 if author and (author[-1] == ')'):
786 author = author[:-1]
787 author = author.strip()
788 context.setdefault('%s_detail' % key, FeedParserDict())
789 context['%s_detail' % key]['name'] = author
790 context['%s_detail' % key]['email'] = email
792 def _start_tagline(self, attrsD):
793 self.incontent += 1
794 self.contentparams = FeedParserDict({'mode': attrsD.get('mode', 'escaped'),
795 'type': attrsD.get('type', 'text/plain'),
796 'language': attrsD.get('xml:lang', self.lang),
797 'base': attrsD.get('xml:base', self.baseuri)})
798 self.push('tagline', 1)
799 _start_subtitle = _start_tagline
801 def _end_tagline(self):
802 value = self.pop('tagline')
803 self.incontent -= 1
804 self.contentparams.clear()
805 if self.infeed:
806 self.feeddata['description'] = value
807 _end_subtitle = _end_tagline
809 def _start_copyright(self, attrsD):
810 self.incontent += 1
811 self.contentparams = FeedParserDict({'mode': attrsD.get('mode', 'escaped'),
812 'type': attrsD.get('type', 'text/plain'),
813 'language': attrsD.get('xml:lang', self.lang),
814 'base': attrsD.get('xml:base', self.baseuri)})
815 self.push('copyright', 1)
816 _start_dc_rights = _start_copyright
818 def _end_copyright(self):
819 self.pop('copyright')
820 self.incontent -= 1
821 self.contentparams.clear()
822 _end_dc_rights = _end_copyright
824 def _start_item(self, attrsD):
825 self.entries.append(FeedParserDict())
826 self.push('item', 0)
827 self.inentry = 1
828 self.guidislink = 0
829 id = self._getAttribute(attrsD, 'rdf:about')
830 if id:
831 context = self._getContext()
832 context['id'] = id
833 self._cdf_common(attrsD)
834 _start_entry = _start_item
835 _start_product = _start_item
837 def _end_item(self):
838 self.pop('item')
839 self.inentry = 0
840 _end_entry = _end_item
842 def _start_dc_language(self, attrsD):
843 self.push('language', 1)
844 _start_language = _start_dc_language
846 def _end_dc_language(self):
847 self.lang = self.pop('language')
848 _end_language = _end_dc_language
850 def _start_dc_publisher(self, attrsD):
851 self.push('publisher', 1)
852 _start_webmaster = _start_dc_publisher
854 def _end_dc_publisher(self):
855 self.pop('publisher')
856 self._sync_author_detail('publisher')
857 _end_webmaster = _end_dc_publisher
859 def _start_dcterms_issued(self, attrsD):
860 self.push('issued', 1)
861 _start_issued = _start_dcterms_issued
863 def _end_dcterms_issued(self):
864 value = self.pop('issued')
865 self._save('issued_parsed', _parse_date(value))
866 _end_issued = _end_dcterms_issued
868 def _start_dcterms_created(self, attrsD):
869 self.push('created', 1)
870 _start_created = _start_dcterms_created
872 def _end_dcterms_created(self):
873 value = self.pop('created')
874 self._save('created_parsed', _parse_date(value))
875 _end_created = _end_dcterms_created
877 def _start_dcterms_modified(self, attrsD):
878 self.push('modified', 1)
879 _start_modified = _start_dcterms_modified
880 _start_dc_date = _start_dcterms_modified
881 _start_pubdate = _start_dcterms_modified
883 def _end_dcterms_modified(self):
884 value = self.pop('modified')
885 parsed_value = _parse_date(value)
886 self._save('modified_parsed', parsed_value)
887 _end_modified = _end_dcterms_modified
888 _end_dc_date = _end_dcterms_modified
889 _end_pubdate = _end_dcterms_modified
891 def _start_expirationdate(self, attrsD):
892 self.push('expired', 1)
894 def _end_expirationdate(self):
895 self._save('expired_parsed', _parse_date(self.pop('expired')))
897 def _start_cc_license(self, attrsD):
898 self.push('license', 1)
899 value = self._getAttribute(attrsD, 'rdf:resource')
900 if value:
901 self.elementstack[-1][2].append(value)
902 self.pop('license')
904 def _start_creativecommons_license(self, attrsD):
905 self.push('license', 1)
907 def _end_creativecommons_license(self):
908 self.pop('license')
910 def _start_category(self, attrsD):
911 self.push('category', 1)
912 domain = self._getAttribute(attrsD, 'domain')
913 cats = []
914 if self.inentry:
915 cats = self.entries[-1].setdefault('categories', [])
916 elif self.infeed:
917 cats = self.feeddata.setdefault('categories', [])
918 cats.append((domain, None))
919 _start_dc_subject = _start_category
920 _start_keywords = _start_category
922 def _end_category(self):
923 self.pop('category')
924 _end_dc_subject = _end_category
925 _end_keywords = _end_category
927 def _start_cloud(self, attrsD):
928 self.feeddata['cloud'] = attrsD
930 def _start_link(self, attrsD):
931 attrsD.setdefault('rel', 'alternate')
932 attrsD.setdefault('type', 'text/html')
933 if attrsD.has_key('href'):
934 attrsD['href'] = self.resolveURI(attrsD['href'])
935 expectingText = self.infeed or self.inentry
936 if self.inentry:
937 self.entries[-1].setdefault('links', [])
938 self.entries[-1]['links'].append(FeedParserDict(attrsD))
939 elif self.infeed:
940 self.feeddata.setdefault('links', [])
941 self.feeddata['links'].append(FeedParserDict(attrsD))
942 if attrsD.has_key('href'):
943 expectingText = 0
944 if attrsD.get('type', '') in self.html_types:
945 if self.inentry:
946 self.entries[-1]['link'] = attrsD['href']
947 elif self.infeed:
948 self.feeddata['link'] = attrsD['href']
949 else:
950 self.push('link', expectingText)
951 _start_producturl = _start_link
953 def _end_link(self):
954 value = self.pop('link')
955 if self.intextinput:
956 context = self._getContext()
957 context['textinput']['link'] = value
958 if self.inimage:
959 context = self._getContext()
960 context['image']['link'] = value
961 _end_producturl = _end_link
963 def _start_guid(self, attrsD):
964 self.guidislink = (attrsD.get('ispermalink', 'true') == 'true')
965 self.push('id', 1)
967 def _end_guid(self):
968 value = self.pop('id')
969 if self.guidislink:
970 # guid acts as link, but only if "ispermalink" is not present or is "true",
971 # and only if the item doesn't already have a link element
972 self._save('link', value)
974 def _start_id(self, attrsD):
975 self.push('id', 1)
977 def _end_id(self):
978 value = self.pop('id')
980 def _start_title(self, attrsD):
981 self.incontent += 1
982 self.contentparams = FeedParserDict({'mode': attrsD.get('mode', 'escaped'),
983 'type': attrsD.get('type', 'text/plain'),
984 'language': attrsD.get('xml:lang', self.lang),
985 'base': attrsD.get('xml:base', self.baseuri)})
986 self.push('title', self.infeed or self.inentry)
987 _start_dc_title = _start_title
989 def _end_title(self):
990 value = self.pop('title')
991 self.incontent -= 1
992 self.contentparams.clear()
993 if self.intextinput:
994 context = self._getContext()
995 context['textinput']['title'] = value
996 elif self.inimage:
997 context = self._getContext()
998 context['image']['title'] = value
999 _end_dc_title = _end_title
1001 def _start_description(self, attrsD, default_content_type='text/html'):
1002 self.incontent += 1
1003 self.contentparams = FeedParserDict({'mode': attrsD.get('mode', 'escaped'),
1004 'type': attrsD.get('type', default_content_type),
1005 'language': attrsD.get('xml:lang', self.lang),
1006 'base': attrsD.get('xml:base', self.baseuri)})
1007 self.push('description', self.infeed or self.inentry)
1009 def _start_abstract(self, attrsD):
1010 return self._start_description(attrsD, 'text/plain')
1012 def _end_description(self):
1013 value = self.pop('description')
1014 self.incontent -= 1
1015 self.contentparams.clear()
1016 context = self._getContext()
1017 if self.intextinput:
1018 context['textinput']['description'] = value
1019 elif self.inimage:
1020 context['image']['description'] = value
1021 elif self.inentry:
1022 context['summary'] = value
1023 elif self.infeed:
1024 context['tagline'] = value
1025 _end_abstract = _end_description
1027 def _start_info(self, attrsD):
1028 self.incontent += 1
1029 self.contentparams = FeedParserDict({'mode': attrsD.get('mode', 'escaped'),
1030 'type': attrsD.get('type', 'text/plain'),
1031 'language': attrsD.get('xml:lang', self.lang),
1032 'base': attrsD.get('xml:base', self.baseuri)})
1033 self.push('info', 1)
1035 def _end_info(self):
1036 self.pop('info')
1037 self.incontent -= 1
1038 self.contentparams.clear()
1040 def _start_generator(self, attrsD):
1041 if attrsD:
1042 if attrsD.has_key('url'):
1043 attrsD['url'] = self.resolveURI(attrsD['url'])
1044 self.feeddata['generator_detail'] = attrsD
1045 self.push('generator', 1)
1047 def _end_generator(self):
1048 value = self.pop('generator')
1049 if self.feeddata.has_key('generator_detail'):
1050 self.feeddata['generator_detail']['name'] = value
1052 def _start_admin_generatoragent(self, attrsD):
1053 self.push('generator', 1)
1054 value = self._getAttribute(attrsD, 'rdf:resource')
1055 if value:
1056 self.elementstack[-1][2].append(value)
1057 self.pop('generator')
1058 self.feeddata['generator_detail'] = FeedParserDict({"url": value})
1060 def _start_admin_errorreportsto(self, attrsD):
1061 self.push('errorreportsto', 1)
1062 value = self._getAttribute(attrsD, 'rdf:resource')
1063 if value:
1064 self.elementstack[-1][2].append(value)
1065 self.pop('errorreportsto')
1067 def _start_summary(self, attrsD):
1068 self.incontent += 1
1069 self.contentparams = FeedParserDict({'mode': attrsD.get('mode', 'escaped'),
1070 'type': attrsD.get('type', 'text/plain'),
1071 'language': attrsD.get('xml:lang', self.lang),
1072 'base': attrsD.get('xml:base', self.baseuri)})
1073 self.push('summary', 1)
1075 def _end_summary(self):
1076 value = self.pop('summary')
1077 if self.entries:
1078 self.entries[-1]['description'] = value
1079 self.incontent -= 1
1080 self.contentparams.clear()
1082 def _start_enclosure(self, attrsD):
1083 if self.inentry:
1084 self.entries[-1].setdefault('enclosures', [])
1085 self.entries[-1]['enclosures'].append(FeedParserDict(attrsD))
1087 def _start_source(self, attrsD):
1088 if self.inentry:
1089 self.entries[-1]['source'] = FeedParserDict(attrsD)
1090 self.push('source', 1)
1092 def _end_source(self):
1093 self.pop('source')
1095 def _start_content(self, attrsD):
1096 self.incontent += 1
1097 self.contentparams = FeedParserDict({'mode': attrsD.get('mode', 'xml'),
1098 'type': attrsD.get('type', 'text/plain'),
1099 'language': attrsD.get('xml:lang', self.lang),
1100 'base': attrsD.get('xml:base', self.baseuri)})
1101 self.push('content', 1)
1103 def _start_prodlink(self, attrsD):
1104 self.incontent += 1
1105 self.contentparams = FeedParserDict({'mode': attrsD.get('mode', 'xml'),
1106 'type': attrsD.get('type', 'text/html'),
1107 'language': attrsD.get('xml:lang', self.lang),
1108 'base': attrsD.get('xml:base', self.baseuri)})
1109 self.push('content', 1)
1111 def _start_body(self, attrsD):
1112 self.incontent += 1
1113 self.contentparams = FeedParserDict({'mode': 'xml',
1114 'type': 'application/xhtml+xml',
1115 'language': attrsD.get('xml:lang', self.lang),
1116 'base': attrsD.get('xml:base', self.baseuri)})
1117 self.push('content', 1)
1118 _start_xhtml_body = _start_body
1120 def _start_content_encoded(self, attrsD):
1121 self.incontent += 1
1122 self.contentparams = FeedParserDict({'mode': 'escaped',
1123 'type': 'text/html',
1124 'language': attrsD.get('xml:lang', self.lang),
1125 'base': attrsD.get('xml:base', self.baseuri)})
1126 self.push('content', 1)
1127 _start_fullitem = _start_content_encoded
1129 def _end_content(self):
1130 value = self.pop('content')
1131 if self.contentparams.get('type') in (['text/plain'] + self.html_types):
1132 self._save('description', value)
1133 self.incontent -= 1
1134 self.contentparams.clear()
1135 _end_body = _end_content
1136 _end_xhtml_body = _end_content
1137 _end_content_encoded = _end_content
1138 _end_fullitem = _end_content
1139 _end_prodlink = _end_content
1141 if _XML_AVAILABLE:
1142 class _StrictFeedParser(_FeedParserMixin, xml.sax.handler.ContentHandler):
1143 def __init__(self, baseuri, encoding):
1144 if _debug: sys.stderr.write('trying StrictFeedParser\n')
1145 xml.sax.handler.ContentHandler.__init__(self)
1146 _FeedParserMixin.__init__(self, baseuri, encoding)
1147 self.bozo = 0
1148 self.exc = None
1150 def startPrefixMapping(self, prefix, uri):
1151 self.trackNamespace(prefix, uri)
1153 def startElementNS(self, name, qname, attrs):
1154 namespace, localname = name
1155 namespace = str(namespace or '')
1156 if namespace.find('backend.userland.com/rss') <> -1:
1157 # match any backend.userland.com namespace
1158 namespace = 'http://backend.userland.com/rss'
1159 prefix = self.namespaces.get(namespace, 'unknown')
1160 if prefix:
1161 localname = prefix + ':' + localname
1162 localname = str(localname).lower()
1164 # qname implementation is horribly broken in Python 2.1 (it
1165 # doesn't report any), and slightly broken in Python 2.2 (it
1166 # doesn't report the xml: namespace). So we match up namespaces
1167 # with a known list first, and then possibly override them with
1168 # the qnames the SAX parser gives us (if indeed it gives us any
1169 # at all). Thanks to MatejC for helping me test this and
1170 # tirelessly telling me that it didn't work yet.
1171 attrsD = {}
1172 for (namespace, attrlocalname), attrvalue in attrs._attrs.items():
1173 prefix = self.namespaces.get(namespace, '')
1174 if prefix:
1175 attrlocalname = prefix + ":" + attrlocalname
1176 attrsD[str(attrlocalname).lower()] = attrvalue
1177 for qname in attrs.getQNames():
1178 attrsD[str(qname).lower()] = attrs.getValueByQName(qname)
1179 self.unknown_starttag(localname, attrsD.items())
1181 # def resolveEntity(self, publicId, systemId):
1182 # return _StringIO()
1184 def characters(self, text):
1185 self.handle_data(text)
1187 def endElementNS(self, name, qname):
1188 namespace, localname = name
1189 namespace = str(namespace)
1190 prefix = self.namespaces.get(namespace, '')
1191 if prefix:
1192 localname = prefix + ':' + localname
1193 localname = str(localname).lower()
1194 self.unknown_endtag(localname)
1196 def error(self, exc):
1197 self.bozo = 1
1198 self.exc = exc
1200 def fatalError(self, exc):
1201 self.error(exc)
1202 raise exc
1204 class _BaseHTMLProcessor(sgmllib.SGMLParser):
1205 elements_no_end_tag = ['area', 'base', 'basefont', 'br', 'col', 'frame', 'hr',
1206 'img', 'input', 'isindex', 'link', 'meta', 'param']
1208 def __init__(self, encoding):
1209 self.encoding = encoding
1210 if _debug: sys.stderr.write('entering BaseHTMLProcessor, encoding=%s\n' % self.encoding)
1211 sgmllib.SGMLParser.__init__(self)
1213 def reset(self):
1214 self.pieces = []
1215 sgmllib.SGMLParser.reset(self)
1217 def feed(self, data):
1218 data = re.compile(r'<!((?!DOCTYPE|--|\[))', re.IGNORECASE).sub(r'&lt;!\1', data)
1219 data = re.sub(r'<(\S+)/>', r'<\1></\1>', data)
1220 data = data.replace('&#39;', "'")
1221 data = data.replace('&#34;', '"')
1222 if self.encoding and (type(data) == types.UnicodeType):
1223 data = data.encode(self.encoding)
1224 sgmllib.SGMLParser.feed(self, data)
1226 def normalize_attrs(self, attrs):
1227 # utility method to be called by descendants
1228 attrs = [(k.lower(), v) for k, v in attrs]
1229 # if self.encoding:
1230 # if _debug: sys.stderr.write('normalize_attrs, encoding=%s\n' % self.encoding)
1231 # attrs = [(k, v.encode(self.encoding)) for k, v in attrs]
1232 attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]
1233 return attrs
1235 def unknown_starttag(self, tag, attrs):
1236 # called for each start tag
1237 # attrs is a list of (attr, value) tuples
1238 # e.g. for <pre class="screen">, tag="pre", attrs=[("class", "screen")]
1239 if _debug: sys.stderr.write('_BaseHTMLProcessor, unknown_starttag, tag=%s\n' % tag)
1240 strattrs = "".join([' %s="%s"' % (key, value) for key, value in attrs])
1241 if tag in self.elements_no_end_tag:
1242 self.pieces.append("<%(tag)s%(strattrs)s />" % locals())
1243 else:
1244 self.pieces.append("<%(tag)s%(strattrs)s>" % locals())
1246 def unknown_endtag(self, tag):
1247 # called for each end tag, e.g. for </pre>, tag will be "pre"
1248 # Reconstruct the original end tag.
1249 if tag not in self.elements_no_end_tag:
1250 self.pieces.append("</%(tag)s>" % locals())
1252 def handle_charref(self, ref):
1253 # called for each character reference, e.g. for "&#160;", ref will be "160"
1254 # Reconstruct the original character reference.
1255 self.pieces.append("&#%(ref)s;" % locals())
1257 def handle_entityref(self, ref):
1258 # called for each entity reference, e.g. for "&copy;", ref will be "copy"
1259 # Reconstruct the original entity reference.
1260 self.pieces.append("&%(ref)s;" % locals())
1262 def handle_data(self, text):
1263 # called for each block of plain text, i.e. outside of any tag and
1264 # not containing any character or entity references
1265 # Store the original text verbatim.
1266 if _debug: sys.stderr.write('_BaseHTMLProcessor, handle_text, text=%s\n' % text)
1267 self.pieces.append(text)
1269 def handle_comment(self, text):
1270 # called for each HTML comment, e.g. <!-- insert Javascript code here -->
1271 # Reconstruct the original comment.
1272 self.pieces.append("<!--%(text)s-->" % locals())
1274 def handle_pi(self, text):
1275 # called for each processing instruction, e.g. <?instruction>
1276 # Reconstruct original processing instruction.
1277 self.pieces.append("<?%(text)s>" % locals())
1279 def handle_decl(self, text):
1280 # called for the DOCTYPE, if present, e.g.
1281 # <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
1282 # "http://www.w3.org/TR/html4/loose.dtd">
1283 # Reconstruct original DOCTYPE
1284 self.pieces.append("<!%(text)s>" % locals())
1286 _new_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9:]*\s*').match
1287 def _scan_name(self, i, declstartpos):
1288 rawdata = self.rawdata
1289 n = len(rawdata)
1290 if i == n:
1291 return None, -1
1292 m = self._new_declname_match(rawdata, i)
1293 if m:
1294 s = m.group()
1295 name = s.strip()
1296 if (i + len(s)) == n:
1297 return None, -1 # end of buffer
1298 return name.lower(), m.end()
1299 else:
1300 self.handle_data(rawdata)
1301 # self.updatepos(declstartpos, i)
1302 return None, -1
1304 def output(self):
1305 """Return processed HTML as a single string"""
1306 return "".join([str(p) for p in self.pieces])
1308 class _LooseFeedParser(_FeedParserMixin, _BaseHTMLProcessor):
1309 def __init__(self, baseuri, encoding):
1310 sgmllib.SGMLParser.__init__(self)
1311 _FeedParserMixin.__init__(self, baseuri, encoding)
1313 def decodeEntities(self, element, data):
1314 data = data.replace('&#60;', '&lt;')
1315 data = data.replace('&#x3c;', '&lt;')
1316 data = data.replace('&#62;', '&gt;')
1317 data = data.replace('&#x3e;', '&gt;')
1318 data = data.replace('&#38;', '&amp;')
1319 data = data.replace('&#x26;', '&amp;')
1320 data = data.replace('&#34;', '&quot;')
1321 data = data.replace('&#x22;', '&quot;')
1322 data = data.replace('&#39;', '&apos;')
1323 data = data.replace('&#x27;', '&apos;')
1324 if self.contentparams.get('mode') == 'escaped':
1325 data = data.replace('&lt;', '<')
1326 data = data.replace('&gt;', '>')
1327 data = data.replace('&amp;', '&')
1328 data = data.replace('&quot;', '"')
1329 data = data.replace('&apos;', "'")
1330 return data
1332 class _RelativeURIResolver(_BaseHTMLProcessor):
1333 relative_uris = [('a', 'href'),
1334 ('applet', 'codebase'),
1335 ('area', 'href'),
1336 ('blockquote', 'cite'),
1337 ('body', 'background'),
1338 ('del', 'cite'),
1339 ('form', 'action'),
1340 ('frame', 'longdesc'),
1341 ('frame', 'src'),
1342 ('iframe', 'longdesc'),
1343 ('iframe', 'src'),
1344 ('head', 'profile'),
1345 ('img', 'longdesc'),
1346 ('img', 'src'),
1347 ('img', 'usemap'),
1348 ('input', 'src'),
1349 ('input', 'usemap'),
1350 ('ins', 'cite'),
1351 ('link', 'href'),
1352 ('object', 'classid'),
1353 ('object', 'codebase'),
1354 ('object', 'data'),
1355 ('object', 'usemap'),
1356 ('q', 'cite'),
1357 ('script', 'src')]
1359 def __init__(self, baseuri, encoding):
1360 _BaseHTMLProcessor.__init__(self, encoding)
1361 self.baseuri = baseuri
1363 def resolveURI(self, uri):
1364 return urlparse.urljoin(self.baseuri, uri)
1366 def unknown_starttag(self, tag, attrs):
1367 attrs = self.normalize_attrs(attrs)
1368 attrs = [(key, ((tag, key) in self.relative_uris) and self.resolveURI(value) or value) for key, value in attrs]
1369 _BaseHTMLProcessor.unknown_starttag(self, tag, attrs)
1371 def _resolveRelativeURIs(htmlSource, baseURI, encoding):
1372 if _debug: sys.stderr.write("entering _resolveRelativeURIs\n")
1373 p = _RelativeURIResolver(baseURI, encoding)
1374 p.feed(htmlSource)
1375 return p.output()
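# [Editorial illustration, not part of the original feedparser source.]
# Relative URIs in the (tag, attribute) pairs listed above are joined against
# the given base URI; the base URI below is a placeholder.
def _example_resolve_relative_uris():
    html = '<a href="/about">about</a>'
    return _resolveRelativeURIs(html, 'http://weblog.example.com/feed', 'utf-8')
    # returns '<a href="http://weblog.example.com/about">about</a>'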
1377 class _HTMLSanitizer(_BaseHTMLProcessor):
1378 acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b', 'big',
1379 'blockquote', 'br', 'button', 'caption', 'center', 'cite', 'code', 'col',
1380 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt', 'em', 'fieldset',
1381 'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input',
1382 'ins', 'kbd', 'label', 'legend', 'li', 'map', 'menu', 'ol', 'optgroup',
1383 'option', 'p', 'pre', 'q', 's', 'samp', 'select', 'small', 'span', 'strike',
1384 'strong', 'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'tfoot', 'th',
1385 'thead', 'tr', 'tt', 'u', 'ul', 'var']
1387 acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
1388 'action', 'align', 'alt', 'axis', 'border', 'cellpadding', 'cellspacing',
1389 'char', 'charoff', 'charset', 'checked', 'cite', 'class', 'clear', 'cols',
1390 'colspan', 'color', 'compact', 'coords', 'datetime', 'dir', 'disabled',
1391 'enctype', 'for', 'frame', 'headers', 'height', 'href', 'hreflang', 'hspace',
1392 'id', 'ismap', 'label', 'lang', 'longdesc', 'maxlength', 'media', 'method',
1393 'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly',
1394 'rel', 'rev', 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
1395 'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title', 'type',
1396 'usemap', 'valign', 'value', 'vspace', 'width']
1398 unacceptable_elements_with_end_tag = ['script', 'applet']
1400 def reset(self):
1401 _BaseHTMLProcessor.reset(self)
1402 self.unacceptablestack = 0
1404 def unknown_starttag(self, tag, attrs):
1405 if not tag in self.acceptable_elements:
1406 if tag in self.unacceptable_elements_with_end_tag:
1407 self.unacceptablestack += 1
1408 return
1409 attrs = self.normalize_attrs(attrs)
1410 attrs = [(key, value) for key, value in attrs if key in self.acceptable_attributes]
1411 _BaseHTMLProcessor.unknown_starttag(self, tag, attrs)
1413 def unknown_endtag(self, tag):
1414 if not tag in self.acceptable_elements:
1415 if tag in self.unacceptable_elements_with_end_tag:
1416 self.unacceptablestack -= 1
1417 return
1418 _BaseHTMLProcessor.unknown_endtag(self, tag)
1420 def handle_pi(self, text):
1421 pass
1423 def handle_decl(self, text):
1424 pass
1426 def handle_data(self, text):
1427 if not self.unacceptablestack:
1428 _BaseHTMLProcessor.handle_data(self, text)
1430 def _sanitizeHTML(htmlSource, encoding):
1431 p = _HTMLSanitizer(encoding)
1432 p.feed(htmlSource)
1433 data = p.output()
1434 if _mxtidy and TIDY_MARKUP:
1435 nerrors, nwarnings, data, errordata = _mxtidy.tidy(data, output_xhtml=1, numeric_entities=1, wrap=0)
1436 if data.count('<body'):
1437 data = data.split('<body', 1)[1]
1438 if data.count('>'):
1439 data = data.split('>', 1)[1]
1440 if data.count('</body'):
1441 data = data.split('</body', 1)[0]
1442 data = data.strip().replace('\r\n', '\n')
1443 return data
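# [Editorial illustration, not part of the original feedparser source.]
# Elements and attributes outside the whitelists above are dropped, and
# everything inside <script> or <applet> disappears along with the tags.
def _example_sanitize_html():
    dirty = '<p onclick="steal()">hi<script>alert(1)</script></p>'
    return _sanitizeHTML(dirty, 'utf-8')
    # returns '<p>hi</p>' (with TIDY_MARKUP left at its default of 0)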
1445 class _FeedURLHandler(urllib2.HTTPRedirectHandler, urllib2.HTTPDefaultErrorHandler):
1446 def http_error_default(self, req, fp, code, msg, headers):
1447 if ((code / 100) == 3) and (code != 304):
1448 return self.http_error_302(req, fp, code, msg, headers)
1449 infourl = urllib.addinfourl(fp, headers, req.get_full_url())
1450 infourl.status = code
1451 return infourl
1453 def http_error_302(self, req, fp, code, msg, headers):
1454 if headers.dict.has_key('location'):
1455 infourl = urllib2.HTTPRedirectHandler.http_error_302(self, req, fp, code, msg, headers)
1456 else:
1457 infourl = urllib.addinfourl(fp, headers, req.get_full_url())
1458 infourl.status = code
1459 return infourl
1461 def http_error_301(self, req, fp, code, msg, headers):
1462 if headers.dict.has_key('location'):
1463 infourl = urllib2.HTTPRedirectHandler.http_error_301(self, req, fp, code, msg, headers)
1464 else:
1465 infourl = urllib.addinfourl(fp, headers, req.get_full_url())
1466 infourl.status = code
1467 return infourl
1469 http_error_300 = http_error_302
1470 http_error_307 = http_error_302
1472 def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers):
1473 """URL, filename, or string --> stream
1475 This function lets you define parsers that take any input source
1476 (URL, pathname to local or network file, or actual data as a string)
1477 and deal with it in a uniform manner. Returned object is guaranteed
1478 to have all the basic stdio read methods (read, readline, readlines).
1479 Just .close() the object when you're done with it.
1481 If the etag argument is supplied, it will be used as the value of an
1482 If-None-Match request header.
1484 If the modified argument is supplied, it must be a tuple of 9 integers
1485 as returned by gmtime() in the standard Python time module. This MUST
1486 be in GMT (Greenwich Mean Time). The formatted date/time will be used
1487 as the value of an If-Modified-Since request header.
1489 If the agent argument is supplied, it will be used as the value of a
1490 User-Agent request header.
1492 If the referrer argument is supplied, it will be used as the value of a
1493 Referer[sic] request header.
1495 If handlers is supplied, it is a list of handlers used to build a
1496 urllib2 opener.
1497 """
1499 if hasattr(url_file_stream_or_string, "read"):
1500 return url_file_stream_or_string
1502 if url_file_stream_or_string == "-":
1503 return sys.stdin
1505 if urlparse.urlparse(url_file_stream_or_string)[0] in ('http', 'https', 'ftp'):
1506 if not agent:
1507 agent = USER_AGENT
1508 # test for inline user:password for basic auth
1509 auth = None
1510 if base64:
1511 urltype, rest = urllib.splittype(url_file_stream_or_string)
1512 realhost, rest = urllib.splithost(rest)
1513 if realhost:
1514 user_passwd, realhost = urllib.splituser(realhost)
1515 if user_passwd:
1516 url_file_stream_or_string = "%s://%s%s" % (urltype, realhost, rest)
1517 auth = base64.encodestring(user_passwd).strip()
1518 # try to open with urllib2 (to use optional headers)
1519 request = urllib2.Request(url_file_stream_or_string)
1520 request.add_header("User-Agent", agent)
1521 if etag:
1522 request.add_header("If-None-Match", etag)
1523 if modified:
1524 # format into an RFC 1123-compliant timestamp. We can't use
1525 # time.strftime() since the %a and %b directives can be affected
1526 # by the current locale, but RFC 2616 states that dates must be
1527 # in English.
1528 short_weekdays = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
1529 months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
1530 request.add_header("If-Modified-Since", "%s, %02d %s %04d %02d:%02d:%02d GMT" % (short_weekdays[modified[6]], modified[2], months[modified[1] - 1], modified[0], modified[3], modified[4], modified[5]))
1531 if referrer:
1532 request.add_header("Referer", referrer)
1533 if gzip and zlib:
1534 request.add_header("Accept-encoding", "gzip, deflate")
1535 elif gzip:
1536 request.add_header("Accept-encoding", "gzip")
1537 elif zlib:
1538 request.add_header("Accept-encoding", "deflate")
1539 else:
1540 request.add_header("Accept-encoding", "")
1541 if auth:
1542 request.add_header("Authorization", "Basic %s" % auth)
1543 if ACCEPT_HEADER:
1544 request.add_header("Accept", ACCEPT_HEADER)
1545 opener = apply(urllib2.build_opener, tuple([_FeedURLHandler()] + handlers))
1546 opener.addheaders = [] # RMK - must clear so we only send our custom User-Agent
1547 try:
1548 return opener.open(request)
1549 finally:
1550 opener.close() # JohnD
1552 # try to open with native open function (if url_file_stream_or_string is a filename)
1553 try:
1554 return open(url_file_stream_or_string)
1555 except:
1556 pass
1558 # treat url_file_stream_or_string as string
1559 return _StringIO(str(url_file_stream_or_string))
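# [Editorial illustration, not part of the original feedparser source.]
# How a caller (such as the parse() function later in the full module) might
# drive _open_resource for a conditional HTTP fetch. The URL and etag values
# are placeholders.
def _example_open_resource():
    url = 'http://feedparser.org/docs/examples/atom10.xml'
    etag = '"abc123"'
    modified = time.gmtime(0)   # 9-tuple in GMT, as the docstring requires
    f = _open_resource(url, etag, modified, USER_AGENT, None, [])
    try:
        return f.read()
    finally:
        f.close()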
1561 # W3DTF-style date parsing adapted from PyXML xml.utils.iso8601, written by
1562 # Drake and licensed under the Python license. Removed all range checking
1563 # for month, day, hour, minute, and second, since mktime will normalize
1564 # these later
1565 def _w3dtf_parse(s):
1566 def __extract_date(m):
1567 year = int(m.group("year"))
1568 if year < 100:
1569 year = 100 * int(time.gmtime()[0] / 100) + int(year)
1570 if year < 1000:
1571 return 0, 0, 0
1572 julian = m.group("julian")
1573 if julian:
1574 julian = int(julian)
1575 month = julian / 30 + 1
1576 day = julian % 30 + 1
1577 jday = None
1578 while jday != julian:
1579 t = time.mktime((year, month, day, 0, 0, 0, 0, 0, 0))
1580 jday = time.gmtime(t)[-2]
1581 diff = abs(jday - julian)
1582 if jday > julian:
1583 if diff < day:
1584 day = day - diff
1585 else:
1586 month = month - 1
1587 day = 31
1588 elif jday < julian:
1589 if day + diff < 28:
1590 day = day + diff
1591 else:
1592 month = month + 1
1593 return year, month, day
1594 month = m.group("month")
1595 day = 1
1596 if month is None:
1597 month = 1
1598 else:
1599 month = int(month)
1600 day = m.group("day")
1601 if day:
1602 day = int(day)
1603 else:
1604 day = 1
1605 return year, month, day
1607 def __extract_time(m):
1608 if not m:
1609 return 0, 0, 0
1610 hours = m.group("hours")
1611 if not hours:
1612 return 0, 0, 0
1613 hours = int(hours)
1614 minutes = int(m.group("minutes"))
1615 seconds = m.group("seconds")
1616 if seconds:
1617 seconds = int(seconds)
1618 else:
1619 seconds = 0
1620 return hours, minutes, seconds
1622 def __extract_tzd(m):
1623 """Return the Time Zone Designator as an offset in seconds from UTC."""
1624 if not m:
1625 return 0
1626 tzd = m.group("tzd")
1627 if not tzd:
1628 return 0
1629 if tzd == "Z":
1630 return 0
1631 hours = int(m.group("tzdhours"))
1632 minutes = m.group("tzdminutes")
1633 if minutes:
1634 minutes = int(minutes)
1635 else:
1636 minutes = 0
1637 offset = (hours*60 + minutes) * 60
1638 if tzd[0] == "+":
1639 return -offset
1640 return offset
1642 __date_re = ("(?P<year>\d\d\d\d)"
1643 "(?:(?P<dsep>-|)"
1644 "(?:(?P<julian>\d\d\d)"
1645 "|(?P<month>\d\d)(?:(?P=dsep)(?P<day>\d\d))?))?")
1646 __tzd_re = "(?P<tzd>[-+](?P<tzdhours>\d\d)(?::?(?P<tzdminutes>\d\d))|Z)"
1647 __tzd_rx = re.compile(__tzd_re)
1648 __time_re = ("(?P<hours>\d\d)(?P<tsep>:|)(?P<minutes>\d\d)"
1649 "(?:(?P=tsep)(?P<seconds>\d\d(?:[.,]\d+)?))?"
1650 + __tzd_re)
1651 __datetime_re = "%s(?:T%s)?" % (__date_re, __time_re)
1652 __datetime_rx = re.compile(__datetime_re)
1653 m = __datetime_rx.match(s)
1654 if m is None or m.group() != s:
1655 return None
1656 gmt = __extract_date(m) + __extract_time(m) + (0, 0, 0)
1657 if gmt[0] == 0: return
1658 return time.mktime(gmt) + __extract_tzd(m) - time.timezone
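# Rough usage sketch for _w3dtf_parse (illustrative, not part of the API):
# a fully qualified W3DTF timestamp is converted to seconds since the epoch,
# with the zone offset folded back into UTC, so
#     time.gmtime(_w3dtf_parse('2004-02-28T18:14:55-08:00'))[:6]
# should come back as approximately (2004, 2, 29, 2, 14, 55).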
1660 # Additional ISO-8601 date parsing routines written by Fazal Majid
1661 # The ISO 8601 standard is very convoluted and irregular - a full ISO 8601
1662 # parser is beyond the scope of feedparser and would be a worthwhile addition
1663 # to the Python library
1664 # A single regular expression cannot parse ISO 8601 date formats into groups
1665 # as the standard is highly irregular (for instance, is 030104 2003-01-04 or
1666 # 0301-04-01?), so we use templates instead
1667 # Please note the order in templates is significant because we need a
1668 # greedy match
1669 _iso8601_tmpl = ['YYYY-?MM-?DD', 'YYYY-MM', 'YYYY-?OOO',
1670 'YY-?MM-?DD', 'YY-?OOO', 'YYYY',
1671 '-YY-?MM', '-OOO', '-YY',
1672 '--MM-?DD', '--MM',
1673 '---DD',
1674 'CC', '']
1675 _iso8601_re = [
1676 tmpl.replace(
1677 'YYYY', r'(?P<year>\d{4})').replace(
1678 'YY', r'(?P<year>\d\d)').replace(
1679 'MM', r'(?P<month>[01]\d)').replace(
1680 'DD', r'(?P<day>[0123]\d)').replace(
1681 'OOO', r'(?P<ordinal>[0123]\d\d)').replace(
1682 'CC', r'(?P<century>\d\d$)')
1683 + r'(T?(?P<hour>\d{2}):(?P<minute>\d{2})'
1684 + r'(:(?P<second>\d{2}))?'
1685 + r'(?P<tz>[+-](?P<tzhour>\d{2})(:(?P<tzmin>\d{2}))?|Z)?)?'
1686 for tmpl in _iso8601_tmpl]
1687 del tmpl
1689 _iso8601_matches = [re.compile(regex).match for regex in _iso8601_re]
1690 del regex
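# Illustrative sketch of how the templates are used: the compiled match
# functions are tried in order, so a compact date like '20040105' is caught by
# the 'YYYY-?MM-?DD' pattern and its groupdict() should carry year/month/day
# as strings ({'year': '2004', 'month': '01', 'day': '05', ...}); _parse_date
# below turns that into a normalized 9-tuple.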
1692 # rfc822.py defines several time zones, but we define some extra ones.
1693 # "ET" is equivalent to "EST", etc.
1694 _additional_timezones = {'AT': -400, 'ET': -500, 'CT': -600, 'MT': -700, 'PT': -800}
1695 rfc822._timezones.update(_additional_timezones)
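# (Illustrative: with this extension, rfc822.parsedate_tz('Fri, 09 Jan 2004
# 18:14:55 ET') should yield a 10-tuple whose final element is the offset in
# seconds, -18000, the same as if 'EST' had been used.)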
1697 # utf-8 byte sequences for some Korean characters seen in pubDate
1698 _korean_year = '\xEB\x85\x84' # U+B144
1699 _korean_month = '\xEC\x9B\x94' # U+C6D4
1700 _korean_day = '\xEC\x9D\xBC' # U+C77C
1701 _korean_date_1_re = \
1702 re.compile('(\d{4})%s\s+(\d{2})%s\s+(\d{2})%s\s+(\d{2}):(\d{2}):(\d{2})' % \
1703 (_korean_year, _korean_month, _korean_day))
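# (Illustrative: a pubDate of the form '2004<year> 05<month> 16<day> 12:34:56',
# where <year>, <month>, <day> stand for the Korean characters above, should be
# rewritten by _parse_date below to '2004-05-16T12:34:56+09:00' before normal
# parsing.)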
1705 def _parse_date(date):
1706 """Parses a variety of date formats into a tuple of 9 integers"""
1707 try:
1708 if type(date) == types.UnicodeType:
1709 date = date.encode('utf-8')
1710 if type(date) != types.StringType: return
1712 # munge Korean dates into usable format
1713 match = _korean_date_1_re.match(date)
1714 if match:
1715 date = "%s-%s-%sT%s:%s:%s+09:00" % match.groups()[:6]
1717 # try the standard rfc822 library, which handles
1718 # RFC822, RFC1123, RFC2822, and asctime
1719 tm = rfc822.parsedate_tz(date)
1720 if tm:
1721 return time.gmtime(rfc822.mktime_tz(tm))
1723 # not a RFC2822 date, try W3DTF profile of ISO-8601
1724 try:
1725 tm = _w3dtf_parse(date)
1726 except ValueError:
1727 tm = None
1728 if tm:
1729 return time.gmtime(tm)
1731 # try various non-W3DTF ISO-8601-compatible formats like 20040105
1732 m = None
1733 for _iso8601_match in _iso8601_matches:
1734 m = _iso8601_match(date)
1735 if m: break
1736 if not m: return
1737 if m.span() == (0, 0): return
1738 params = m.groupdict()
1739 ordinal = params.get("ordinal", 0)
1740 if ordinal:
1741 ordinal = int(ordinal)
1742 else:
1743 ordinal = 0
1744 year = params.get("year", "--")
1745 if not year or year == "--":
1746 year = time.gmtime()[0]
1747 elif len(year) == 2:
1748 # ISO 8601 assumes current century, i.e. 93 -> 2093, NOT 1993
1749 year = 100 * int(time.gmtime()[0] / 100) + int(year)
1750 else:
1751 year = int(year)
1752 month = params.get("month", "-")
1753 if not month or month == "-":
1754 # ordinals are NOT normalized by mktime, we simulate them
1755 # by setting month=1, day=ordinal
1756 if ordinal:
1757 month = 1
1758 else:
1759 month = time.gmtime()[1]
1760 month = int(month)
1761 day = params.get("day", 0)
1762 if not day:
1763 # see above
1764 if ordinal:
1765 day = ordinal
1766 elif params.get("century", 0) or \
1767 params.get("year", 0) or params.get("month", 0):
1768 day = 1
1769 else:
1770 day = time.gmtime()[2]
1771 else:
1772 day = int(day)
1773 # special case of the century - is the first year of the 21st century
1774 # 2000 or 2001 ? The debate goes on...
1775 if "century" in params.keys():
1776 year = (int(params["century"]) - 1) * 100 + 1
1777 # in ISO 8601 most fields are optional
1778 for field in ["hour", "minute", "second", "tzhour", "tzmin"]:
1779 if not params.get(field, None):
1780 params[field] = 0
1781 hour = int(params.get("hour", 0))
1782 minute = int(params.get("minute", 0))
1783 second = int(params.get("second", 0))
1784 # weekday is normalized by mktime(), we can ignore it
1785 weekday = 0
1786 # daylight savings is complex, but not needed for feedparser's purposes
1787 # as time zones, if specified, include mention of whether it is active
1788 # (e.g. PST vs. PDT, CET). Using -1 is implementation-dependent,
1789 # and most implementations have DST bugs
1790 daylight_savings_flag = 0
1791 tm = [year, month, day, hour, minute, second, weekday,
1792 ordinal, daylight_savings_flag]
1793 # ISO 8601 time zone adjustments
1794 tz = params.get("tz")
1795 if tz and tz != "Z":
1796 if tz[0] == "-":
1797 tm[3] += int(params.get("tzhour", 0))
1798 tm[4] += int(params.get("tzmin", 0))
1799 elif tz[0] == "+":
1800 tm[3] -= int(params.get("tzhour", 0))
1801 tm[4] -= int(params.get("tzmin", 0))
1802 else:
1803 return None
1804 # Python's time.mktime() is a wrapper around the ANSI C mktime(3c)
1805 # which is guaranteed to normalize d/m/y/h/m/s
1806 # many implementations have bugs, but we'll pretend they don't
1807 return time.localtime(time.mktime(tm))
1808 except:
1809 return None
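# Rough usage sketch for _parse_date (illustrative): all supported formats end
# up as a 9-tuple in UTC, so both of the following should describe the same
# instant:
#     _parse_date('Sun, 16 May 2004 03:34:56 GMT')[:6]   # RFC 822 path
#     _parse_date('2004-05-16T12:34:56+09:00')[:6]       # W3DTF path
# each yielding approximately (2004, 5, 16, 3, 34, 56).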
1811 def _getCharacterEncoding(http_headers, xml_data):
1812 """Get the character encoding of the XML document
1814 http_headers is a dictionary
1815 xml_data is a raw string (not Unicode)
1817 This is so much trickier than it sounds,
1818 it's not even funny. According to RFC 3023 ("XML Media Types"), if
1819 the HTTP Content-Type is application/xml, application/*+xml,
1820 application/xml-external-parsed-entity, or application/xml-dtd,
1821 the encoding given in the charset parameter of the HTTP Content-Type
1822 takes precedence over the encoding given in the XML prefix within the
1823 document, and defaults to "utf-8" if neither is specified. But, if
1824 the HTTP Content-Type is text/xml, text/*+xml, or
1825 text/xml-external-parsed-entity, the encoding given in the XML prefix
1826 within the document is ALWAYS IGNORED and only the encoding given in
1827 the charset parameter of the HTTP Content-Type header should be
1828 respected, and it defaults to "us-ascii" if not specified.
1830 Furthermore, discussion on the atom-syntax mailing list with the
1831 author of RFC 3023 leads me to the conclusion that any document
1832 served with a Content-Type of text/* and no charset parameter
1833 must be treated as us-ascii. (We now do this.) And also that it
1834 must always be flagged as non-well-formed. (We do not do this.)
1836 If Content-Type is unspecified (input was local file or non-HTTP source)
1837 or unrecognized (server just got it totally wrong), then go by the
1838 encoding given in the XML prefix of the document and default to
1839 "utf-8" as per the XML specification. This part is probably wrong,
1840 as HTTP defaults to "iso-8859-1" if no Content-Type is specified.
1842 Also, the default Content-Type and well-formedness of XML documents
1843 served as wacky types like "application/octet-stream" is still under
1844 discussion.
1845 """
1847 def _parseHTTPContentType(content_type):
1848 """takes HTTP Content-Type header and returns (content type, charset)
1850 If no charset is specified, returns (content type, '')
1851 If no content type is specified, returns ('', '')
1852 Both return parameters are guaranteed to be lowercase strings
1853 """
1854 if not content_type:
1855 return '', ''
1856 content_type = content_type.strip().lower()
1857 paramstr = content_type.split(';')[1:]
1858 if not paramstr:
1859 return content_type, ''
1860 content_type = content_type.split(';', 1)[0].strip().lower()
1861 if not paramstr[0]:
1862 # declaration like "text/xml;" (note ending semicolon)
1863 # dunno if this is malformed but it sure was hard to track down
1864 return content_type, ''
1865 import string
1866 if not paramstr[0].count('='):
1867 # malformed declaration like "text/xml; charset:utf-8" (note : instead of =)
1868 return content_type, ''
1869 params = dict([map(string.lower, map(string.strip, p.strip().split('=', 1))) for p in paramstr])
1870 charset = params.get('charset')
1871 if not charset:
1872 return content_type, ''
1873 if charset[0] in ('"', "'"):
1874 charset = charset[1:]
1875 if charset and charset[-1] in ('"', "'"):
1876 charset = charset[:-1]
1877 charset = charset.strip()
1878 return content_type, charset
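# (Illustrative: _parseHTTPContentType('text/xml; charset=utf-8') and
# _parseHTTPContentType('text/xml; charset="utf-8"') should both return
# ('text/xml', 'utf-8'), while the malformed 'text/xml; charset:iso-8859-1'
# falls back to ('text/xml', '').)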
1880 sniffed_xml_encoding = ''
1881 xml_encoding = ''
1882 true_encoding = ''
1883 http_content_type, http_encoding = _parseHTTPContentType(http_headers.get("content-type"))
1884 # Must sniff for non-ASCII-compatible character encodings before
1885 # searching for XML declaration. This heuristic is defined in
1886 # section F of the XML specification:
1887 # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
1888 try:
1889 if xml_data[:4] == '\x4c\x6f\xa7\x94':
1890 # EBCDIC
1891 xml_data = _ebcdic_to_ascii(xml_data)
1892 elif xml_data[:4] == '\x00\x3c\x00\x3f':
1893 # UTF-16BE
1894 sniffed_xml_encoding = 'utf-16be'
1895 xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
1896 elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') and (xml_data[2:4] != '\x00\x00'):
1897 # UTF-16BE with BOM
1898 sniffed_xml_encoding = 'utf-16be'
1899 xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
1900 elif xml_data[:4] == '\x3c\x00\x3f\x00':
1901 # UTF-16LE
1902 sniffed_xml_encoding = 'utf-16le'
1903 xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
1904 elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and (xml_data[2:4] != '\x00\x00'):
1905 # UTF-16LE with BOM
1906 sniffed_xml_encoding = 'utf-16le'
1907 xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
1908 elif xml_data[:4] == '\x00\x00\x00\x3c':
1909 # UTF-32BE
1910 sniffed_xml_encoding = 'utf-32be'
1911 xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
1912 elif xml_data[:4] == '\x3c\x00\x00\x00':
1913 # UTF-32LE
1914 sniffed_xml_encoding = 'utf-32le'
1915 xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
1916 elif xml_data[:4] == '\x00\x00\xfe\xff':
1917 # UTF-32BE with BOM
1918 sniffed_xml_encoding = 'utf-32be'
1919 xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
1920 elif xml_data[:4] == '\xff\xfe\x00\x00':
1921 # UTF-32LE with BOM
1922 sniffed_xml_encoding = 'utf-32le'
1923 xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
1924 elif xml_data[:3] == '\xef\xbb\xbf':
1925 # UTF-8 with BOM
1926 sniffed_xml_encoding = 'utf-8'
1927 xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
1928 else:
1929 # ASCII-compatible
1930 pass
1931 xml_encoding_match = re.compile('^<\?.*encoding=[\'"](.*?)[\'"].*\?>').match(xml_data)
1932 except:
1933 xml_encoding_match = None
1934 if xml_encoding_match:
1935 xml_encoding = xml_encoding_match.groups()[0].lower()
1936 if sniffed_xml_encoding and (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode', 'iso-10646-ucs-4', 'ucs-4', 'csucs4', 'utf-16', 'utf-32', 'utf_16', 'utf_32', 'utf16', 'u16')):
1937 xml_encoding = sniffed_xml_encoding
1938 if (http_content_type == 'application/xml') or \
1939 (http_content_type == 'application/xml-dtd') or \
1940 (http_content_type == 'application/xml-external-parsed-entity') or \
1941 (http_content_type.startswith('application/') and http_content_type.endswith('+xml')):
1942 if http_encoding:
1943 true_encoding = http_encoding
1944 elif xml_encoding:
1945 true_encoding = xml_encoding
1946 else:
1947 true_encoding = 'utf-8'
1948 elif (http_content_type == 'text/xml') or \
1949 (http_content_type == 'text/xml-external-parsed-entity') or \
1950 (http_content_type.startswith('text/')):# and http_content_type.endswith('+xml')):
1951 if http_encoding:
1952 true_encoding = http_encoding
1953 else:
1954 true_encoding = 'us-ascii'
1955 elif http_headers and (not http_headers.has_key('content-type')):
1956 true_encoding = xml_encoding or 'utf-8' #'iso-8859-1'
1957 else:
1958 true_encoding = xml_encoding or 'utf-8'
1959 return true_encoding, http_encoding, xml_encoding, sniffed_xml_encoding
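# A few illustrative resolutions of the RFC 3023 rules described above (hedged,
# assuming well-behaved input):
#   - headers {'content-type': 'application/atom+xml; charset=iso-8859-1'}
#     win over any encoding in the XML declaration -> 'iso-8859-1'
#   - headers {'content-type': 'text/xml'} with <?xml ... encoding='utf-8'?>
#     ignore the declaration entirely -> 'us-ascii'
#   - no Content-Type at all (e.g. a local file) falls back to the declared
#     XML encoding, or 'utf-8' if there is none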
1961 def _toUTF8(data, encoding):
1962 """Changes an XML data stream on the fly to specify a new encoding
1964 data is a raw sequence of bytes (not Unicode) that is presumed to be in %encoding already
1965 encoding is a string recognized by encodings.aliases
1966 """
1967 if _debug: sys.stderr.write('entering _toUTF8, trying encoding %s\n' % encoding)
1968 # strip Byte Order Mark (if present)
1969 if (len(data) >= 4) and (data[:2] == '\xfe\xff') and (data[2:4] != '\x00\x00'):
1970 if _debug:
1971 sys.stderr.write('stripping BOM\n')
1972 if encoding != 'utf-16be':
1973 sys.stderr.write('trying utf-16be instead\n')
1974 encoding = 'utf-16be'
1975 data = data[2:]
1976 elif (len(data) >= 4) and (data[:2] == '\xff\xfe') and (data[2:4] != '\x00\x00'):
1977 if _debug:
1978 sys.stderr.write('stripping BOM\n')
1979 if encoding != 'utf-16le':
1980 sys.stderr.write('trying utf-16le instead\n')
1981 encoding = 'utf-16le'
1982 data = data[2:]
1983 elif data[:3] == '\xef\xbb\xbf':
1984 if _debug:
1985 sys.stderr.write('stripping BOM\n')
1986 if encoding != 'utf-8':
1987 sys.stderr.write('trying utf-8 instead\n')
1988 encoding = 'utf-8'
1989 data = data[3:]
1990 elif data[:4] == '\x00\x00\xfe\xff':
1991 if _debug:
1992 sys.stderr.write('stripping BOM\n')
1993 if encoding != 'utf-32be':
1994 sys.stderr.write('trying utf-32be instead\n')
1995 encoding = 'utf-32be'
1996 data = data[4:]
1997 elif data[:4] == '\xff\xfe\x00\x00':
1998 if _debug:
1999 sys.stderr.write('stripping BOM\n')
2000 if encoding != 'utf-32le':
2001 sys.stderr.write('trying utf-32le instead\n')
2002 encoding = 'utf-32le'
2003 data = data[4:]
2004 newdata = unicode(data, encoding)
2005 if _debug: sys.stderr.write('successfully converted %s data to unicode\n' % encoding)
2006 declmatch = re.compile('^<\?xml[^>]*?>')
2007 newdecl = """<?xml version='1.0' encoding='utf-8'?>"""
2008 if declmatch.search(newdata):
2009 newdata = declmatch.sub(newdecl, newdata)
2010 else:
2011 newdata = newdecl + u'\n' + newdata
2012 return newdata.encode("utf-8")
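# Illustrative sketch: given ISO-8859-1 bytes such as
#     "<?xml version='1.0' encoding='iso-8859-1'?><feed/>"
# a call like _toUTF8(data, 'iso-8859-1') should return the same document as
# UTF-8 bytes with the declaration rewritten to encoding='utf-8'.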
2014 def _stripDoctype(data):
2015 """Strips DOCTYPE from XML document, returns (rss_version, stripped_data)
2017 rss_version may be "rss091n" or None
2018 stripped_data is the same XML document, minus the DOCTYPE
2019 """
2020 entity_pattern = re.compile(r'<!ENTITY([^>]*?)>', re.MULTILINE)
2021 data = entity_pattern.sub('', data)
2022 doctype_pattern = re.compile(r'<!DOCTYPE([^>]*?)>', re.MULTILINE)
2023 doctype_results = doctype_pattern.findall(data)
2024 doctype = doctype_results and doctype_results[0] or ''
2025 if doctype.lower().count('netscape'):
2026 version = 'rss091n'
2027 else:
2028 version = None
2029 data = doctype_pattern.sub('', data)
2030 return version, data
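# Illustrative sketch: a Netscape RSS 0.91 document such as
#     '<!DOCTYPE rss PUBLIC "-//Netscape Communications//DTD RSS 0.91//EN"
#      "http://my.netscape.com/publish/formats/rss-0.91.dtd"><rss>...</rss>'
# should come back from _stripDoctype as ('rss091n', '<rss>...</rss>'), while a
# document with no DOCTYPE returns (None, data) unchanged.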
2032 def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=[]):
2033 """Parse a feed from a URL, file, stream, or string"""
2035 result = FeedParserDict()
2036 result['feed'] = FeedParserDict()
2037 result['entries'] = []
2038 if _XML_AVAILABLE:
2039 result['bozo'] = 0
2040 if type(handlers) == types.InstanceType:
2041 handlers = [handlers]
2042 try:
2043 f = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers)
2044 data = f.read()
2045 except Exception, e:
2046 result['bozo'] = 1
2047 result['bozo_exception'] = e
2048 data = ''
2049 f = None
2051 # if feed is gzip-compressed, decompress it
2052 if f and data and hasattr(f, "headers"):
2053 if gzip and f.headers.get('content-encoding', '') == 'gzip':
2054 try:
2055 data = gzip.GzipFile(fileobj=_StringIO(data)).read()
2056 except Exception, e:
2057 # Some feeds claim to be gzipped but they're not, so
2058 # we get garbage. Ideally, we should re-request the
2059 # feed without the "Accept-encoding: gzip" header,
2060 # but we don't.
2061 result['bozo'] = 1
2062 result['bozo_exception'] = e
2063 data = ''
2064 elif zlib and f.headers.get('content-encoding', '') == 'deflate':
2065 try:
2066 data = zlib.decompress(data, -zlib.MAX_WBITS)
2067 except Exception, e:
2068 result['bozo'] = 1
2069 result['bozo_exception'] = e
2070 data = ''
2072 # save HTTP headers
2073 if hasattr(f, "info"):
2074 info = f.info()
2075 result["etag"] = info.getheader("ETag")
2076 last_modified = info.getheader("Last-Modified")
2077 if last_modified:
2078 result["modified"] = _parse_date(last_modified)
2079 if hasattr(f, "url"):
2080 result["url"] = f.url
2081 result["status"] = 200
2082 if hasattr(f, "status"):
2083 result["status"] = f.status
2084 if hasattr(f, "headers"):
2085 result["headers"] = f.headers.dict
2086 if hasattr(f, "close"):
2087 f.close()
2089 # there are three encodings to keep track of:
2090 # - xml_encoding is the encoding declared in the <?xml declaration
2091 # - http_encoding is the encoding declared in the Content-Type HTTP header
2092 # - result['encoding'] is the actual encoding, as specified by RFC 3023
2093 result['encoding'], http_encoding, xml_encoding, sniffed_xml_encoding = \
2094 _getCharacterEncoding(result.get("headers", {}), data)
2095 result['version'], data = _stripDoctype(data)
2096 baseuri = result.get('headers', {}).get('content-location', result.get('url'))
2098 # if server sent 304, we're done
2099 if result.get("status", 0) == 304:
2100 result['version'] = ''
2101 result['debug_message'] = "The feed has not changed since you last checked, " + \
2102 "so the server sent no data. This is a feature, not a bug!"
2103 return result
2105 # if there was a problem downloading, we're done
2106 if not data:
2107 return result
2109 # determine character encoding
2110 use_strict_parser = 0
2111 known_encoding = 0
2112 tried_encodings = []
2113 for proposed_encoding in (result['encoding'], xml_encoding, sniffed_xml_encoding, 'utf-8', 'windows-1252'):
2114 if proposed_encoding in tried_encodings: continue
2115 if not proposed_encoding: continue
2116 try:
2117 data = _toUTF8(data, proposed_encoding)
2118 known_encoding = 1
2119 use_strict_parser = 1
2120 break
2121 except:
2122 pass
2123 tried_encodings.append(proposed_encoding)
2124 if not known_encoding:
2125 result['bozo'] = 1
2126 result['bozo_exception'] = CharacterEncodingUnknown( \
2127 "document encoding unknown, I tried " + \
2128 "%s, %s, utf-8, and windows-1252 but nothing worked" % \
2129 (result['encoding'], xml_encoding))
2130 result['encoding'] = ''
2131 elif proposed_encoding != result['encoding']:
2132 result['bozo'] = 1
2133 result['bozo_exception'] = CharacterEncodingOverride( \
2134 "documented declared as %s, but parsed as %s" % \
2135 (result['encoding'], proposed_encoding))
2136 result['encoding'] = proposed_encoding
2138 if not _XML_AVAILABLE:
2139 use_strict_parser = 0
2140 if use_strict_parser:
2141 # initialize the SAX parser
2142 feedparser = _StrictFeedParser(baseuri, 'utf-8')
2143 saxparser = xml.sax.make_parser(PREFERRED_XML_PARSERS)
2144 saxparser.setFeature(xml.sax.handler.feature_namespaces, 1)
2145 saxparser.setContentHandler(feedparser)
2146 saxparser.setErrorHandler(feedparser)
2147 source = xml.sax.xmlreader.InputSource()
2148 source.setByteStream(_StringIO(data))
2149 if hasattr(saxparser, '_ns_stack'):
2150 # work around bug in built-in SAX parser (doesn't recognize xml: namespace)
2151 # PyXML doesn't have this problem, and it doesn't have _ns_stack either
2152 saxparser._ns_stack.append({'http://www.w3.org/XML/1998/namespace':'xml'})
2153 try:
2154 saxparser.parse(source)
2155 except Exception, e:
2156 if _debug:
2157 import traceback
2158 traceback.print_stack()
2159 traceback.print_exc()
2160 sys.stderr.write('xml parsing failed\n')
2161 result['bozo'] = 1
2162 result['bozo_exception'] = feedparser.exc or e
2163 use_strict_parser = 0
2164 if not use_strict_parser:
2165 feedparser = _LooseFeedParser(baseuri, known_encoding and 'utf-8' or '')
2166 feedparser.feed(data)
2167 result['feed'] = feedparser.feeddata
2168 result['entries'] = feedparser.entries
2169 result['version'] = result['version'] or feedparser.version
2170 if _debug:
2171 import pprint
2172 pprint.pprint(result)
2173 return result
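# Typical usage (illustrative sketch; available keys depend on the feed and on
# how it was fetched):
#     result = parse('http://example.org/feed.xml')  # any URL, file, or string
#     result['feed'].get('title')       # feed-level metadata
#     result['entries'][0].get('link')  # per-entry data
#     result['bozo']                    # 1 if the feed was not well-formed
# A later conditional refetch can reuse the cached validators:
#     parse(url, etag=result.get('etag'), modified=result.get('modified'))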
2175 if __name__ == '__main__':
2176 if not sys.argv[1:]:
2177 print __doc__
2178 sys.exit(0)
2179 else:
2180 urls = sys.argv[1:]
2181 from pprint import pprint
2182 for url in urls:
2183 print url
2184 print
2185 result = parse(url)
2186 pprint(result)
2187 print
2189 #REVISION HISTORY
2190 #1.0 - 9/27/2002 - MAP - fixed namespace processing on prefixed RSS 2.0 elements,
2191 # added Simon Fell's test suite
2192 #1.1 - 9/29/2002 - MAP - fixed infinite loop on incomplete CDATA sections
2193 #2.0 - 10/19/2002
2194 # JD - use inchannel to watch out for image and textinput elements which can
2195 # also contain title, link, and description elements
2196 # JD - check for isPermaLink="false" attribute on guid elements
2197 # JD - replaced openAnything with open_resource supporting ETag and
2198 # If-Modified-Since request headers
2199 # JD - parse now accepts etag, modified, agent, and referrer optional
2200 # arguments
2201 # JD - modified parse to return a dictionary instead of a tuple so that any
2202 # etag or modified information can be returned and cached by the caller
2203 #2.0.1 - 10/21/2002 - MAP - changed parse() so that if we don't get anything
2204 # because of etag/modified, return the old etag/modified to the caller to
2205 # indicate why nothing is being returned
2206 #2.0.2 - 10/21/2002 - JB - added the inchannel to the if statement, otherwise it's
2207 # useless. Fixes the problem JD was addressing by adding it.
2208 #2.1 - 11/14/2002 - MAP - added gzip support
2209 #2.2 - 1/27/2003 - MAP - added attribute support, admin:generatorAgent.
2210 # start_admingeneratoragent is an example of how to handle elements with
2211 # only attributes, no content.
2212 #2.3 - 6/11/2003 - MAP - added USER_AGENT for default (if caller doesn't specify);
2213 # also, make sure we send the User-Agent even if urllib2 isn't available.
2214 # Match any variation of backend.userland.com/rss namespace.
2215 #2.3.1 - 6/12/2003 - MAP - if item has both link and guid, return both as-is.
2216 #2.4 - 7/9/2003 - MAP - added preliminary Pie/Atom/Echo support based on Sam Ruby's
2217 # snapshot of July 1 <http://www.intertwingly.net/blog/1506.html>; changed
2218 # project name
2219 #2.5 - 7/25/2003 - MAP - changed to Python license (all contributors agree);
2220 # removed unnecessary urllib code -- urllib2 should always be available anyway;
2221 # return actual url, status, and full HTTP headers (as result['url'],
2222 # result['status'], and result['headers']) if parsing a remote feed over HTTP --
2223 # this should pass all the HTTP tests at <http://diveintomark.org/tests/client/http/>;
2224 # added the latest namespace-of-the-week for RSS 2.0
2225 #2.5.1 - 7/26/2003 - RMK - clear opener.addheaders so we only send our custom
2226 # User-Agent (otherwise urllib2 sends two, which confuses some servers)
2227 #2.5.2 - 7/28/2003 - MAP - entity-decode inline xml properly; added support for
2228 # inline <xhtml:body> and <xhtml:div> as used in some RSS 2.0 feeds
2229 #2.5.3 - 8/6/2003 - TvdV - patch to track whether we're inside an image or
2230 # textInput, and also to return the character encoding (if specified)
2231 #2.6 - 1/1/2004 - MAP - dc:author support (MarekK); fixed bug tracking
2232 # nested divs within content (JohnD); fixed missing sys import (JohanS);
2233 # fixed regular expression to capture XML character encoding (Andrei);
2234 # added support for Atom 0.3-style links; fixed bug with textInput tracking;
2235 # added support for cloud (MartijnP); added support for multiple
2236 # category/dc:subject (MartijnP); normalize content model: "description" gets
2237 # description (which can come from description, summary, or full content if no
2238 # description), "content" gets dict of base/language/type/value (which can come
2239 # from content:encoded, xhtml:body, content, or fullitem);
2240 # fixed bug matching arbitrary Userland namespaces; added xml:base and xml:lang
2241 # tracking; fixed bug tracking unknown tags; fixed bug tracking content when
2242 # <content> element is not in default namespace (like Pocketsoap feed);
2243 # resolve relative URLs in link, guid, docs, url, comments, wfw:comment,
2244 # wfw:commentRSS; resolve relative URLs within embedded HTML markup in
2245 # description, xhtml:body, content, content:encoded, title, subtitle,
2246 # summary, info, tagline, and copyright; added support for pingback and
2247 # trackback namespaces
2248 #2.7 - 1/5/2004 - MAP - really added support for trackback and pingback
2249 # namespaces, as opposed to 2.6 when I said I did but didn't really;
2250 # sanitize HTML markup within some elements; added mxTidy support (if
2251 # installed) to tidy HTML markup within some elements; fixed indentation
2252 # bug in _parse_date (FazalM); use socket.setdefaulttimeout if available
2253 # (FazalM); universal date parsing and normalization (FazalM): 'created', 'modified',
2254 # 'issued' are parsed into 9-tuple date format and stored in 'created_parsed',
2255 # 'modified_parsed', and 'issued_parsed'; 'date' is duplicated in 'modified'
2256 # and vice-versa; 'date_parsed' is duplicated in 'modified_parsed' and vice-versa
2257 #2.7.1 - 1/9/2004 - MAP - fixed bug handling &quot; and &apos;. fixed memory
2258 # leak not closing url opener (JohnD); added dc:publisher support (MarekK);
2259 # added admin:errorReportsTo support (MarekK); Python 2.1 dict support (MarekK)
2260 #2.7.4 - 1/14/2004 - MAP - added workaround for improperly formed <br/> tags in
2261 # encoded HTML (skadz); fixed unicode handling in normalize_attrs (ChrisL);
2262 # fixed relative URI processing for guid (skadz); added ICBM support; added
2263 # base64 support
2264 #2.7.5 - 1/15/2004 - MAP - added workaround for malformed DOCTYPE (seen on many
2265 # blogspot.com sites); added _debug variable
2266 #2.7.6 - 1/16/2004 - MAP - fixed bug with StringIO importing
2267 #3.0b3 - 1/23/2004 - MAP - parse entire feed with real XML parser (if available);
2268 # added several new supported namespaces; fixed bug tracking naked markup in
2269 # description; added support for enclosure; added support for source; re-added
2270 # support for cloud which got dropped somehow; added support for expirationDate
2271 #3.0b4 - 1/26/2004 - MAP - fixed xml:lang inheritance; fixed multiple bugs tracking
2272 # xml:base URI, one for documents that don't define one explicitly and one for
2273 # documents that define an outer and an inner xml:base that goes out of scope
2274 # before the end of the document
2275 #3.0b5 - 1/26/2004 - MAP - fixed bug parsing multiple links at feed level
2276 #3.0b6 - 1/27/2004 - MAP - added feed type and version detection, result["version"]
2277 # will be one of SUPPORTED_VERSIONS.keys() or empty string if unrecognized;
2278 # added support for creativeCommons:license and cc:license; added support for
2279 # full Atom content model in title, tagline, info, copyright, summary; fixed bug
2280 # with gzip encoding (not always telling server we support it when we do)
2281 #3.0b7 - 1/28/2004 - MAP - support Atom-style author element in author_detail
2282 # (dictionary of "name", "url", "email"); map author to author_detail if author
2283 # contains name + email address
2284 #3.0b8 - 1/28/2004 - MAP - added support for contributor
2285 #3.0b9 - 1/29/2004 - MAP - fixed check for presence of dict function; added
2286 # support for summary
2287 #3.0b10 - 1/31/2004 - MAP - incorporated ISO-8601 date parsing routines from
2288 # xml.util.iso8601
2289 #3.0b11 - 2/2/2004 - MAP - added 'rights' to list of elements that can contain
2290 # dangerous markup; fiddled with decodeEntities (not right); liberalized
2291 # date parsing even further
2292 #3.0b12 - 2/6/2004 - MAP - fiddled with decodeEntities (still not right);
2293 # added support to Atom 0.2 subtitle; added support for Atom content model
2294 # in copyright; better sanitizing of dangerous HTML elements with end tags
2295 # (script, frameset)
2296 #3.0b13 - 2/8/2004 - MAP - better handling of empty HTML tags (br, hr, img,
2297 # etc.) in embedded markup, in either HTML or XHTML form (<br>, <br/>, <br />)
2298 #3.0b14 - 2/8/2004 - MAP - fixed CDATA handling in non-wellformed feeds under
2299 # Python 2.1
2300 #3.0b15 - 2/11/2004 - MAP - fixed bug resolving relative links in wfw:commentRSS;
2301 # fixed bug capturing author and contributor URL; fixed bug resolving relative
2302 # links in author and contributor URL; fixed bug resolving relative links in
2303 # generator URL; added support for recognizing RSS 1.0; passed Simon Fell's
2304 # namespace tests, and included them permanently in the test suite with his
2305 # permission; fixed namespace handling under Python 2.1
2306 #3.0b16 - 2/12/2004 - MAP - fixed support for RSS 0.90 (broken in b15)
2307 #3.0b17 - 2/13/2004 - MAP - determine character encoding as per RFC 3023
2308 #3.0b18 - 2/17/2004 - MAP - always map description to summary_detail (Andrei);
2309 # use libxml2 (if available)
2310 #3.0b19 - 3/15/2004 - MAP - fixed bug exploding author information when author
2311 # name was in parentheses; removed ultra-problematic mxTidy support; patch to
2312 # workaround crash in PyXML/expat when encountering invalid entities
2313 # (MarkMoraes); support for textinput/textInput
2314 #3.0b20 - 4/7/2004 - MAP - added CDF support
2315 #3.0b21 - 4/14/2004 - MAP - added Hot RSS support
2316 #3.0b22 - 4/19/2004 - MAP - changed 'channel' to 'feed', 'item' to 'entries' in
2317 # results dict; changed results dict to allow getting values with results.key
2318 # as well as results[key]; work around embedded illformed HTML with half
2319 # a DOCTYPE; work around malformed Content-Type header; if character encoding
2320 # is wrong, try several common ones before falling back to regexes (if this
2321 # works, bozo_exception is set to CharacterEncodingOverride); fixed character
2322 # encoding issues in BaseHTMLProcessor by tracking encoding and converting
2323 # from Unicode to raw strings before feeding data to sgmllib.SGMLParser;
2324 # convert each value in results to Unicode (if possible), even if using
2325 # regex-based parsing
2326 #3.0b23 - 4/21/2004 - MAP - fixed UnicodeDecodeError for feeds that contain
2327 # high-bit characters in attributes in embedded HTML in description (thanks
2328 # Thijs van de Vossen); moved guid, date, and date_parsed to mapped keys in
2329 # FeedParserDict; tweaked FeedParserDict.has_key to return True if asking
2330 # about a mapped key
2331 #3.0fc1 - 4/23/2004 - MAP - made results.entries[0].links[0] and
2332 # results.entries[0].enclosures[0] into FeedParserDict; fixed typo that could
2333 # cause the same encoding to be tried twice (even if it failed the first time);
2334 # fixed DOCTYPE stripping when DOCTYPE contained entity declarations;
2335 # better textinput and image tracking in illformed RSS 1.0 feeds
2336 #3.0fc2 - 5/10/2004 - MAP - added and passed Sam's amp tests; added and passed
2337 # my blink tag tests
2338 #3.0fc3 - 6/18/2004 - MAP - fixed bug in _changeEncodingDeclaration that
2339 # failed to parse utf-16 encoded feeds; made source into a FeedParserDict;
2340 # duplicate admin:generatorAgent/@rdf:resource in generator_detail.url;
2341 # added support for image; refactored parse() fallback logic to try other
2342 # encodings if SAX parsing fails (previously it would only try other encodings
2343 # if re-encoding failed); remove unichr madness in normalize_attrs now that
2344 # we're properly tracking encoding in and out of BaseHTMLProcessor; set
2345 # feed.language from root-level xml:lang; set entry.id from rdf:about;
2346 # send Accept header
2347 #3.0 - 6/21/2004 - MAP - don't try iso-8859-1 (can't distinguish between
2348 # iso-8859-1 and windows-1252 anyway, and most incorrectly marked feeds are
2349 # windows-1252); fixed regression that could cause the same encoding to be
2350 # tried twice (even if it failed the first time)
2351 #3.0.1 - 6/22/2004 - MAP - default to us-ascii for all text/* content types;
2352 # recover from malformed content-type header parameter with no equals sign
2353 # ("text/xml; charset:iso-8859-1")
2354 #3.1 - 6/28/2004 - MAP - added and passed tests for converting HTML entities
2355 # to Unicode equivalents in illformed feeds (aaronsw); added and
2356 # passed tests for converting character entities to Unicode equivalents
2357 # in illformed feeds (aaronsw); test for valid parsers when setting
2358 # XML_AVAILABLE; make version and encoding available when server returns
2359 # a 304; add handlers parameter to pass arbitrary urllib2 handlers (like
2360 # digest auth or proxy support); add code to parse username/password
2361 # out of url and send as basic authentication; expose downloading-related
2362 # exceptions in bozo_exception (aaronsw); added __contains__ method to
2363 # FeedParserDict (aaronsw); added publisher_detail (aaronsw)
2364 #3.2 - 7/3/2004 - MAP - use cjkcodecs and iconv_codec if available; always
2365 # convert feed to UTF-8 before passing to XML parser; completely revamped
2366 # logic for determining character encoding and attempting XML parsing
2367 # (much faster); increased default timeout to 20 seconds; test for presence
2368 # of Location header on redirects; added tests for many alternate character
2369 # encodings; support various EBCDIC encodings; support UTF-16BE and
2370 # UTF-16LE with or without a BOM; support UTF-8 with a BOM; support
2371 # UTF-32BE and UTF-32LE with or without a BOM; fixed crashing bug if no
2372 # XML parsers are available; added support for "Content-encoding: deflate";
2373 # send blank "Accept-encoding: " header if neither gzip nor zlib modules
2374 # are available