products/CPSRSS

view RSSChannel.py @ 289:4c69e58c3c6a

hgbundler made release tag
author Georges Racinet on purity.racinet.fr <georges@racinet.fr>
date Wed, 23 Nov 2011 19:59:09 +0100
parents 8b941a0115c5
children
line source
1 # (C) Copyright 2003-2008 Nuxeo SAS <http://nuxeo.com>
2 # Authors:
3 # Emmanuel Pietriga (ep@nuxeo.com)
4 #
5 # This program is free software; you can redistribute it and/or modify
6 # it under the terms of the GNU General Public License version 2 as published
7 # by the Free Software Foundation.
8 #
9 # This program is distributed in the hope that it will be useful,
10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 # GNU General Public License for more details.
13 #
14 # You should have received a copy of the GNU General Public License
15 # along with this program; if not, write to the Free Software
16 # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
17 # 02111-1307, USA.
18 """The RSS tool manages RSS channels and refreshes them.
19 """
21 import logging
22 import time
23 import urllib
24 from urllib2 import ProxyHandler
26 from AccessControl import ClassSecurityInfo
27 from Globals import InitializeClass
30 from OFS.PropertyManager import PropertyManager
31 from Products.CMFCore.permissions import View
32 from Products.CMFCore.permissions import ModifyPortalContent
33 from Products.CMFCore.permissions import ManagePortal
34 from Products.CMFCore.utils import getToolByName
35 from Products.CMFCore.PortalContent import PortalContent
36 from Products.CMFDefault.DublinCore import DefaultDublinCoreImpl
38 from Products.CPSCore.EventServiceTool import getEventService
40 from zope.interface import implements
42 from Products.CPSRSS.interfaces import IRSSChannel
44 logger = logging.getLogger(__name__)
46 # (Ultraliberal RSS Parser) referred to as URP in this code
47 # http://feedparser.org/
48 # This parser is required for RSSChannel to function properly
49 # put feedparser.py in the same directory as RSSChannel.py
50 # or in your_zope_root/lib/python/
51 import feedparser
53 from sgmllib import SGMLParseError
RSSChannel_meta_type = 'RSS Channel'

# Factory type information for the 'RSS Channel' type: a one-element tuple
# holding the type description and its two object actions. The action names
# are French labels ('Voir' = view, 'Modifier' = edit).
factory_type_information = (
    dict(
        id='RSS Channel',
        description='RSS Channel',
        title='',
        content_icon='document.gif',
        product='RSSTool',
        meta_type=RSSChannel_meta_type,
        factory='addRSSChannel',
        immediate_view='rsschannel_view',
        filter_content_types=0,
        actions=(
            dict(id='view',
                 name='Voir',
                 action='rsschannel_view',
                 permissions=(View,),
                 category='object'),
            dict(id='edit',
                 name='Modifier',
                 action='rsschannel_edit_form',
                 permissions=(ModifyPortalContent,),
                 category='object'),
        )
    ),
)
class RSSChannel(PortalContent, DefaultDublinCoreImpl):
    """Portal content object wrapping one RSS (or pre-rendered HTML) feed.

    RSSChannel handles calls to the RSS parser (feedparser, called URP in
    this module) and reorganizes the resulting data structures, mainly by
    filtering: irrelevant data is dropped.

    ``_data`` caches the filtered result of the last refresh:

    - RSS feed: 'title', 'description', 'url', 'lines' (list of item
      dicts), 'newWindow' and 'feedType' (0);
    - HTML feed: 'feedType' (1) and 'htmlData' (the page as fetched).

    Only http:// and https:// URLs are ever fetched.
    """

    implements(IRSSChannel)

    meta_type = RSSChannel_meta_type
    portal_type = RSSChannel_meta_type # to be able to add CMF object via ZMI

    security = ClassSecurityInfo()
    security.declareObjectProtected(View)

    _properties = (
        {'id': 'title', 'type': 'ustring', 'mode': 'w',
         'label': 'Title'},
        {'id': 'description', 'type': 'utext', 'mode': 'w',
         'label': 'Description'},
        {'id': 'channel_url', 'type': 'string', 'mode':'w',
         'label': 'Channel URL'},
        {'id': 'channel_proxy', 'type': 'string', 'mode':'w',
         'label': 'Proxy used to access channel'},
        {'id': 'new_window', 'type': 'boolean', 'mode': 'w',
         'label': 'Open Links in New Window'},
        {'id': 'nbMaxItems', 'type': 'int', 'mode':'w',
         'label': 'Maximum number of items'},
        {'id': 'html_feed', 'type': 'boolean', 'mode': 'w',
         'label': 'HTML feeds are provided untransformed'},
        )

    # Filled by a refresh
    title = ''
    description = ''
    channel_url = ''
    channel_proxy = ''
    # True if links to news items should open in new windows
    new_window = 0
    # Maximum number of items, 0 means unlimited
    nbMaxItems = 0
    # True if the feed is already formatted in HTML,
    # in which case we provide it "as is" to the box
    html_feed = 0

    # Remember last time we retrieved a feed so that we can manually
    # tell feedparser to go find it again or not (trying to correct
    # weird behaviour)
    _etag = None
    _modified = None

    def __init__(self, id, channel_url='', channel_proxy='',
                 new_window=0, nbMaxItems=0, html_feed=0):
        """Create a channel that has never been refreshed.

        id -- object id, also used as title when the feed provides none
        channel_url -- URL of the feed (http:// or https://)
        channel_proxy -- optional HTTP proxy used to reach the feed
        new_window -- true if item links should open in a new window
        nbMaxItems -- maximum number of items kept, 0 means unlimited
        html_feed -- true if the URL serves ready-made HTML, not RSS
        """
        self.id = id
        self.channel_url = channel_url
        self.channel_proxy = channel_proxy
        self.new_window = new_window
        self.nbMaxItems = nbMaxItems
        self.html_feed = html_feed
        self._refresh_time = 0 # far in the past
        self._data = {}

    #
    # API
    #
    security.declareProtected(ManagePortal, 'refresh')
    def refresh(self):
        """Refresh the channel from its source and notify the event service.
        """
        self._refresh()

        # notify the event service
        evtool = getEventService(self)
        evtool.notifyEvent('rss_channel_refresh', self, {})

    security.declareProtected(View, 'getData')
    def getData(self, maxItems=None):
        """Get the data for this channel, as a dict.

        The channel is refreshed first if needed (see _maybeRefresh). The
        returned dict is a shallow copy of the cached data. For RSS feeds,
        'lines' is cut down to ``maxItems`` entries, or to the channel's
        nbMaxItems when ``maxItems`` is not given.
        """
        self._maybeRefresh()
        data = self._data.copy()
        if not self.html_feed:
            limit = maxItems or self.nbMaxItems
            # A limit of 0 is the special case: we want all the items.
            if limit:
                data['lines'] = data.get('lines', [])[:limit]
        return data

    #
    # internal
    #
    def _maybeRefresh(self):
        """Refresh if on lazy refresh and the delay has elapsed.

        When the tool is not on lazy refresh, refresh unconditionally.
        """
        # GR: I find it doubtful that refresh lazyness and delay
        # are set at the tool, because they are likely to depend on the
        # rate of the feed itself. The tool should imo provide default values
        # keeping status quo for now.
        rss_tool = getToolByName(self, 'portal_rss')
        if not rss_tool.lazy_refresh:
            logger.debug('Not on lazy refresh')
            self._refresh()
            return

        delay = rss_tool.refresh_delay
        now = int(time.time())
        if now - self._refresh_time > delay:
            logger.debug('Refreshing %r', self.id)
            self._refresh()
        else:
            logger.debug("Not refreshing %r (now=%s last=%s)", self.id,
                         now, self._refresh_time)

    def _refresh(self):
        """Refresh the channel from its source and record when it was done.
        """
        if self.html_feed:
            self._retrieveHTMLFeed()
        else:
            self._retrieveRSSFeed()
        self._refresh_time = int(time.time())

    def _isHTTPUrl(self, url):
        """Tell whether ``url`` starts with http:// or https://."""
        return url.startswith('http://') or url.startswith('https://')

    def _parseFeed(self, url):
        """Fetch and parse ``url`` with URP and return its result.

        For a URL that is not http(s), an SGML parsing error or a socket
        timeout, an empty result ({'entries': [], 'feed': {}}) is returned
        instead, so that the caller falls back on the data it already has.
        """
        # Only needed here, hence the function-level imports.
        import socket
        import sys

        if not self._isHTTPUrl(url):
            # feedparser would also accept a local path or raw feed text
            logger.warn('Not fetching feed at %r: only http:// and https:// '
                        'URLs are supported', url)
            return {'entries': [], 'feed': {}}

        proxy = self.channel_proxy
        handlers = []
        # GR TODO : replace this by a portal-wide setting and maybe use
        # system-wide setting (environ['http_proxy'])
        if proxy:
            logger.info("Using HTTP proxy %r", proxy)
            handlers.append(ProxyHandler({'http': proxy}))

        # NOTE(review): this class stores entries under 'lines', never
        # under 'items', so this test is always false and the remembered
        # etag/modified validators are never sent. Kept as is on purpose:
        # testing 'lines' would enable conditional requests, but then
        # 'newWindow' in the cached data would stay stale until the feed
        # itself changes. To be decided deliberately.
        if self._data.get('items'):
            etag, modified = self._etag, self._modified
        else:
            etag = modified = None

        # sys.exc_info() instead of binding the exception in the except
        # clause: that spelling is accepted by every Python version.
        try:
            return feedparser.parse(url, etag, modified, handlers=handlers)
        except SGMLParseError:
            logger.warn('RSS/SGML parsing error for feed at %s: %s', url,
                        sys.exc_info()[1])
        except socket.timeout:
            logger.warn('Timeout retrieving feed at %s: %s', url,
                        sys.exc_info()[1])
        return {'entries': [], 'feed': {}}

    def _setTitleAndDescription(self, title, description):
        """Copy feed title/description on self, defaulting title to the id.
        """
        if title is None or not title.strip():
            title = self.id
        self.title = title
        self.description = description

    def _rememberValidators(self, data):
        """Store the etag / modified values found in a URP result, if any.
        """
        # has_key() rather than `in`: presumably older feedparser versions
        # only apply their legacy key aliases through has_key()/get() --
        # verify against the feedparser in use before changing this.
        if data.has_key('etag'):
            self._etag = data['etag']
        if data.has_key('modified'):
            self._modified = data['modified']

    def _retrieveRSSFeed(self):
        """Call URP which will fetch and parse the RSS/XML feed.

        Three outcomes:

        - HTTP status >= 400: the cached data is replaced by a
          "Broken RSS Channel" placeholder;
        - entries and feed information present: they are filtered and
          become the new cached data;
        - nothing usable (feed unchanged, parsing error, timeout, URL not
          fetched): the cached data is kept, missing keys being filled
          with defaults.
        """
        url = self.channel_url
        data = self._parseFeed(url)

        if data.has_key('status') and data['status'] >= 400:
            # If the http request fails, the description field could contain
            # more info about why the request failed, like the error code
            # (404, etc.) but this might be overly complex/geeky in the
            # general context
            self._data = {'title': "Broken RSS Channel",
                          'description': "URL " + url + " cannot be accessed.",
                          'url': url,
                          'lines': [],
                          'newWindow': self.new_window,
                          'feedType': 0, #RSS feed
                          }
            return

        # Even if it succeeds, there might still be no data in the feed.
        # This happens when the parser finds out that the feed has not
        # changed since it was last retrieved.
        entries = data.get('entries')
        feed = data.get('feed')
        if entries and feed:
            # Filter and reorganize data generated by URP. Entries without
            # a link or with a blank title are skipped; other fields are
            # optional (this depends on the quality of the feed).
            items = []
            for entry in entries:
                if entry.has_key('link') and entry.get('title', '').strip():
                    items.append({
                        'title': entry['title'],
                        'url': entry['link'],
                        'description': entry.get('description', ''),
                        'author': entry.get('author', ''),
                        'modified': entry.get('modified', ''),
                        'modified_parsed': entry.get('modified_parsed', ''),
                        })
            # If the max number of items to be displayed is limited
            # and the total number of items is higher, truncate.
            if self.nbMaxItems and len(items) > self.nbMaxItems:
                items = items[:self.nbMaxItems]

            filteredData = {
                'lines': items,
                'newWindow': self.new_window,
                'feedType': 0, # RSS feed
                'title': feed.get('title', ''),
                'description': feed.get('description', ''),
                'url': feed.get('link', ''),
                }
            self._rememberValidators(data)
            self._setTitleAndDescription(filteredData['title'],
                                         filteredData['description'])
            # Avoid replacing the cached data if nothing has changed.
            if self._data != filteredData:
                self._data = filteredData
        else:
            # Keep what we have, making sure every expected key is there.
            cached = self._data
            if not cached.get('title', '').strip():
                cached['title'] = self.id
            cached.setdefault('description', '')
            if not cached.get('url', '').strip():
                cached['url'] = url
            cached.setdefault('lines', [])
            cached.setdefault('newWindow', self.new_window)
            cached.setdefault('feedType', 0)
            self._setTitleAndDescription(cached['title'],
                                         cached['description'])
            self._rememberValidators(data)

    def _retrieveHTMLFeed(self):
        """Fetch an HTML feed and cache the page as is.

        The page is only fetched from http:// and https:// URLs, because
        urllib.urlopen would otherwise also open local files. If the URL
        is refused or the retrieval fails, the cached page is empty and
        the description reports the error.
        """
        url = self.channel_url
        html_data = ''
        self.title = 'HTML Feed'
        self.description = "This feed has been formatted in HTML on the " \
            "server side. It can only be displayed as is ; no other " \
            "information is available."
        if self._isHTTPUrl(url):
            try:
                f = urllib.urlopen(url)
                try:
                    html_data = f.read()
                finally:
                    # always release the connection
                    f.close()
            except IOError:
                html_data = ''
                self.description = "An error occured while retrieving this feed"
        else:
            logger.warn('Not fetching HTML feed at %r: only http:// and '
                        'https:// URLs are supported', url)
            self.description = "An error occured while retrieving this feed"
        data = {'feedType': 1, 'htmlData': html_data}
        if self._data != data:
            # Avoid modifying persistent object if nothing has changed.
            self._data = data

    #
    # ZMI
    #
    manage_options = (PropertyManager.manage_options + # Properties
                      PortalContent.manage_options[:1] + # skip Edit
                      PortalContent.manage_options[3:])

InitializeClass(RSSChannel)
def addRSSChannel(container, id, channel_url, REQUEST=None, **kw):
    """Create an empty RSS Channel.

    The channel is built from ``id`` and ``channel_url`` only and stored in
    ``container``; extra keyword arguments are accepted but not used. When
    a REQUEST is given, the response is redirected to the container's
    manage_main page.
    """
    container._setObject(id, RSSChannel(id, channel_url))
    # Read the new object back from the container (result not used here).
    container._getOb(id)
    if not REQUEST:
        return
    REQUEST.RESPONSE.redirect('%s/manage_main' % container.absolute_url())