products/CPSRSS

view RSSChannel.py @ 216:30f6d0876296

Testing for per-user caching capabilities
author Georges Racinet on purity.racinet.fr <georges@racinet.fr>
date Thu, 15 Jul 2010 11:29:51 +0200
parents 8d651512b305
children e45ebb5d4820
line source
1 # (C) Copyright 2003-2008 Nuxeo SAS <http://nuxeo.com>
2 # Authors:
3 # Emmanuel Pietriga (ep@nuxeo.com)
4 #
5 # This program is free software; you can redistribute it and/or modify
6 # it under the terms of the GNU General Public License version 2 as published
7 # by the Free Software Foundation.
8 #
9 # This program is distributed in the hope that it will be useful,
10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 # GNU General Public License for more details.
13 #
14 # You should have received a copy of the GNU General Public License
15 # along with this program; if not, write to the Free Software
16 # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
17 # 02111-1307, USA.
18 #
19 # $Id$
20 """The RSS tool manages RSS channels and refreshes them.
21 """
23 from zLOG import LOG, DEBUG
25 import logging
26 import time
27 import urllib
28 from urllib2 import ProxyHandler
29 from urllib2 import URLError
30 import socket
32 from AccessControl import ClassSecurityInfo
33 from AccessControl.SecurityManagement import getSecurityManager
34 from Globals import InitializeClass
37 from OFS.PropertyManager import PropertyManager
38 from Products.CMFCore.permissions import View
39 from Products.CMFCore.permissions import ModifyPortalContent
40 from Products.CMFCore.permissions import ManagePortal
41 from Products.CMFCore.utils import getToolByName
42 from Products.CMFCore.PortalContent import PortalContent
43 from Products.CMFDefault.DublinCore import DefaultDublinCoreImpl
45 from Products.CPSCore.EventServiceTool import getEventService
47 from zope.interface import implements
49 from Products.CPSRSS.interfaces import IRSSChannel
51 logger = logging.getLogger('Products.CPSRSS.RSSChannel')
53 # (Ultraliberal RSS Parser) referred to as URP in this code
54 # http://feedparser.org/
55 # This parser is required for RSSChannel to function properly
56 # put feedparser.py in the same directory as RSSChannel.py
57 # or in your_zope_root/lib/python/
58 import feedparser
60 from sgmllib import SGMLParseError
# Meta type of the channel objects; the channel class also uses it as its
# portal type.
RSSChannel_meta_type = 'RSS Channel'

# CMF type information for the 'RSS Channel' type: a single-entry tuple
# declaring the factory, the default view and the 'view' / 'edit' actions
# together with the permission guarding each of them.
factory_type_information = (
    {
        'id': 'RSS Channel',
        'description': 'RSS Channel',
        'title': '',
        'content_icon': 'document.gif',
        'product': 'RSSTool',
        'meta_type': RSSChannel_meta_type,
        'factory': 'addRSSChannel',
        'immediate_view': 'rsschannel_view',
        'filter_content_types': 0,
        'actions': (
            {
                'id': 'view',
                'name': 'Voir',
                'action': 'rsschannel_view',
                'permissions': (View,),
                'category': 'object',
            },
            {
                'id': 'edit',
                'name': 'Modifier',
                'action': 'rsschannel_edit_form',
                'permissions': (ModifyPortalContent,),
                'category': 'object',
            },
        ),
    },
)
91 class RSSChannel(PortalContent, DefaultDublinCoreImpl):
92 """
93 RSSChannel handles calls to the RSS parser and reorganizes
94 resulting data structures (mainly filtering)
96 Restructuring gets rid of irrelevant data.
97 """
99 implements(IRSSChannel)
101 meta_type = RSSChannel_meta_type
102 portal_type = RSSChannel_meta_type # to be able to add CMF object via ZMI
104 security = ClassSecurityInfo()
105 security.declareObjectProtected(View)
107 _properties = (
108 {'id': 'title', 'type': 'string', 'mode': 'w',
109 'label': 'Title'},
110 {'id': 'description', 'type': 'text', 'mode': 'w',
111 'label': 'Description'},
112 {'id': 'channel_url', 'type': 'string', 'mode':'w',
113 'label': 'Channel URL'},
114 {'id': 'channel_proxy', 'type': 'string', 'mode':'w',
115 'label': 'Proxy used to access channel'},
116 {'id': 'new_window', 'type': 'boolean', 'mode': 'w',
117 'label': 'Open Links in New Window'},
118 {'id': 'forward_auth', 'type': 'boolean', 'mode': 'w',
119 'label': 'Forward current authentication to retrieve feed'},
120 {'id': 'nbMaxItems', 'type': 'int', 'mode':'w',
121 'label': 'Maximum number of items'},
122 {'id': 'html_feed', 'type': 'boolean', 'mode': 'w',
123 'label': 'HTML feeds are provided untransformed'},
124 {'id': 'max_stale', 'type': 'int', 'mode': 'w',
125 'label': 'Maximum time to keep stale data (seconds)'}
126 )
128 # Filled by a refresh
129 title = ''
130 description = ''
131 channel_url = ''
132 channel_proxy = ''
133 # True if links to news items should open in new windows
134 forward_auth = False
135 new_window = 0
136 # Maximum number of items, 0 means unlimited
137 nbMaxItems = 0
138 # True if the feed is already formatted in HTML,
139 # in which case we provide it "as is" to the box
140 html_feed = 0
142 max_stale = 0 # means 10x refresh_delay
144 def __init__(self, id, channel_url='', channel_proxy='',
145 new_window=0, nbMaxItems=0, html_feed=0):
146 self.id = id
147 self.channel_url = channel_url
148 self.channel_proxy = channel_proxy
149 self.new_window = new_window
150 self.nbMaxItems = nbMaxItems
151 self.html_feed = html_feed
152 # user-dependent caches (user_id -> data)
153 # in case this channel is user-independent, the None key
154 # is used for code homogeneity
155 self._data = {}
157 self._refresh_time = {} # (user_id -> time), same convention
158 # user_id (or None) -> dict for etag and modified)
159 # GR this is kept separate from _data in order to minimize code
160 # to be updated and regression risks.
161 self.http_caching_headers = {}
163 #
164 # API
165 #
166 security.declareProtected(ManagePortal, 'refresh')
167 def refresh(self):
168 """Refresh the channels from its source."""
170 self._refresh()
172 # notify the event service
173 evtool = getEventService(self)
174 evtool.notifyEvent('rss_channel_refresh', self, {})
176 security.declareProtected(View, 'getData')
177 def getData(self, maxItems=None, user_id=None):
178 """Get the data for this channel, as a dict.
179 If user_id is specified, a separate cache is kept for this user.
180 This is done anyway in case auth is being forwarded. Having this as
181 a direct option is especially useful for tests.
182 """
184 if user_id is None and self.forward_auth:
185 user_id = getSecurityManager().getUser().getId()
187 self._maybeRefresh(user_id=user_id)
188 data = self._data[user_id].copy()
190 if not self.html_feed:
191 lines = data.get('lines', [])
192 maxItems = maxItems or self.nbMaxItems
193 if maxItems:
194 # O special case.
195 # We want all the items
196 data.update({'lines': lines[:maxItems]})
197 return data
199 #
200 # internal
201 #
202 def _maybeRefresh(self, user_id=None):
203 """Refresh if on lazy refresh and the delay has elapsed."""
205 now = int(time.time())
207 delay = self.refresh_delay # acquired from parent (portal_rss)
209 # cleaning up all stale caches safe this one in order to ease on memory
210 # stale is stronger than to be refreshed (we have last_modified, etag
211 # and such for the latter).
212 max_stale = self.max_stale
213 if max_stale == 0:
214 max_stale = delay * 10
215 for u, t in self._refresh_time.items():
216 if u == user_id:
217 continue
218 if now - t > self.max_stale:
219 del self._refresh_time[u]
220 self._data.pop(u, None) # pop for robustness
222 logger.debug(
223 "Removed stale cache (now=%s latest=%s) for user '%s'",
224 now, t, user_id)
226 if not self.lazy_refresh: # acquired from parent (portal_rss)
227 logger.debug('_maybeRefresh: not on lazy refresh')
228 self._refresh(user_id=user_id)
229 return
231 latest = self._refresh_time.get(user_id, 0)
232 if user_id is None:
233 logger.debug("_maybeRefresh: checking %s cache ", self)
234 logger.debug("_maybeRefresh: checking %s cache for user '%s'",
235 self, user_id)
237 if now - latest > delay:
238 logger.debug('_maybeRefresh: refreshing %s', self)
239 self._refresh(user_id=user_id)
240 else:
241 logger.debug('_maybeRefresh: not refreshing %s (now=%s latest=%s)',
242 self, now, latest)
244 def _refresh(self, user_id=None):
245 """Refresh the channels from its source."""
247 if self.html_feed:
248 self._retrieveHTMLFeed(user_id=user_id)
249 else:
250 self._retrieveRSSFeed(user_id=user_id)
252 now = int(time.time())
253 self._refresh_time[user_id] = now
255 def _retrieveRSSFeed(self, user_id=None):
256 """Call URP which will fetch and parse the RSS/XML feed"""
258 url = self.channel_url
259 if not url.startswith('http://') or url.startswith('https://'):
260 data = {'channel': {}, 'items': []}
262 existing_data = self._data.setdefault(user_id, {})
263 handlers = []
264 try :
265 if self.channel_proxy:
266 proxy_handler = ProxyHandler({'http': self.channel_proxy})
267 handlers = [proxy_handler]
268 elif self.forward_auth:
269 atool = getToolByName(self, 'extended_authentication', None)
270 if atool is None:
271 logger.warn("In channel '%s' (%s): forward_auth needs "
272 "CPSExtendedAuthentication",
273 self.getId(), self.title)
274 else:
275 handlers = [atool.getForwarderHandler()]
276 if existing_data.get('items'):
277 headers = self.http_caching_headers.get(user_id, {})
278 data = feedparser.parse(url, headers.get('etag'),
279 headers.get('modified'),
280 handlers=handlers)
281 else:
282 data = feedparser.parse(url, None, None, handlers=handlers)
283 except SGMLParseError, err:
284 data = {'channel': {}, 'items': []}
285 LOG('RSSChannel Error', DEBUG,
286 'RSS/SGML parsing error while retrieving feed\n'
287 +str(url)+'\n'+str(err))
288 except URLError, err:
289 data = {'channel': {}, 'items': []}
290 if isinstance(getattr(err, 'reason', None), socket.timeout):
291 logger.info("Timeout on %s for feed '%s' (%s)",
292 url, self.getId(), self.title)
293 else:
294 logger.exception("Error on %s for feed '%s' (%s)",
295 url, self.getId(), self.title)
297 if data.has_key('status') and data['status'] >= 400:
298 # If the http request fails, the description field could contain
299 # more info about why the request failed, like the error code (404,
300 # etc.) but this might be overly complex/geeky in the general
301 # context
302 self._data[user_id] = {'title': "Broken RSS Channel",
303 'description': "URL " + url + " cannot be accessed.",
304 'url': url,
305 'lines': [],
306 'newWindow': self.new_window,
307 'feedType': 0, #RSS feed
308 }
309 else:
310 # Even if it succeeds, there might still be no data in the feed.
311 # This happens when the parser finds out that the feed has not
312 # changed since it was last retrieved.
313 if data['entries'] and data['feed'] :
314 # Avoid modifying persistent object if nothing has changed.
315 # data['entries'] is empty if nothing has changed since the
316 # feed was last retrieved.
318 # Filter and reorganize data generated by URP.
319 items = []
320 for it in data['entries']:
321 # Fill with actual values if exist (for robustness as this
322 # might depend on the quality of the feed)
323 item = {}
324 if it.has_key('link') and it.get('title', '').strip():
325 item['title'] = it['title']
326 item['url'] = it['link']
327 item['description'] = it.get('description', '')
328 item['author'] = it.get('author', '')
329 item['modified'] = it.get('modified','')
330 item['modified_parsed'] = it.get('modified_parsed','')
331 items.append(item)
332 # If the max number of items to be displayed is limited
333 # and the total number of items is higher, truncate.
334 if self.nbMaxItems and len(items) > self.nbMaxItems:
335 items = items[:self.nbMaxItems]
336 # feedType=0 indicates an RSS feed
337 filteredData = {'lines': items, 'newWindow': self.new_window,
338 'feedType': 0}
339 # init values
340 filteredData['title'] = ''
341 filteredData['description'] = ''
342 filteredData['url'] = ''
343 # Fill with actual values if exist (for robustness as this
344 # might depend on the quality of the feed).
345 if data.has_key('feed'):
346 chn = data['feed']
347 if chn.has_key('title'):
348 filteredData['title'] = chn['title']
349 if chn.has_key('description'):
350 filteredData['description'] = chn['description']
351 if chn.has_key('link'):
352 filteredData['url'] = chn['link']
354 self._store_headers(user_id, data)
355 self.title = filteredData['title']
356 if self.title is None or len(self.title) == 0 \
357 or self.title.isspace():
358 self.title = self.id
359 self.description = filteredData['description']
360 # Assign data to object.
361 if existing_data != filteredData:
362 self._data[user_id] = filteredData
363 else:
364 # we simply update the existing data
365 if not existing_data.get('title', '').strip():
366 existing_data['title'] = self.id
367 if not existing_data.has_key('description'):
368 existing_data['description'] = ''
369 if not existing_data.get('url', '').strip():
370 existing_data['url'] = url
371 if not existing_data.has_key('lines'):
372 existing_data['lines'] = []
373 if not existing_data.has_key('newWindow'):
374 existing_data['newWindow'] = self.new_window
375 if not existing_data.has_key('feedType'):
376 existing_data['feedType'] = 0
377 self.title = existing_data['title']
378 if self.title is None or len(self.title) == 0 \
379 or self.title.isspace():
380 self.title = self.id
381 self.description = existing_data['description']
382 self._store_headers(user_id, data)
384 def _store_headers(self, user_id, data):
385 """Extracts from data and store HTTP caching related headers."""
386 headers = self.http_caching_headers.setdefault(user_id, {})
387 if data.has_key('etag'):
388 headers['etag'] = data['etag']
389 if data.has_key('modified'):
390 headers['modified'] = data['modified']
392 def _retrieveHTMLFeed(self):
393 """Fetch an HTML feed"""
395 url = self.channel_url
396 if not url.startswith('http://') or url.startswith('https://'):
397 html_data = ''
398 self.title = 'HTML Feed'
399 self.description = "This feed has been formatted in HTML on the " \
400 "server side. It can only be displayed as is ; no other " \
401 "information is available."
402 try:
403 f = urllib.urlopen(url)
404 html_data = f.read()
405 except IOError:
406 html_data = ''
407 self.description = "An error occured while retrieving this feed"
408 data = {'feedType': 1, 'htmlData': html_data}
409 if self._data.get(user_id) != data:
410 # Avoid modifying persistent object if nothing has changed.
411 self._data[user_id] = data
413 #
414 # ZMI
415 #
416 manage_options = (PropertyManager.manage_options + # Properties
417 PortalContent.manage_options[:1] + # skip Edit
418 PortalContent.manage_options[3:])
420 InitializeClass(RSSChannel)
def addRSSChannel(container, id, channel_url, REQUEST=None, **kw):
    """Create an empty RSS Channel.

    The channel is built from id and channel_url only (extra keyword
    arguments are accepted but not used) and stored in container.  When a
    REQUEST is given, the response is redirected to the container's
    manage_main page.
    """
    container._setObject(id, RSSChannel(id, channel_url))
    # Look the stored channel up again, as the factory always did; the
    # result is not used any further.
    channel = container._getOb(id)
    if not REQUEST:
        return
    container_url = container.absolute_url()
    REQUEST.RESPONSE.redirect('%s/manage_main' % container_url)