products/CPSRSS

changeset 215:8d651512b305 authforward

Rewrote caching logic to be potentially user dependent. Unit tested only.
author Georges Racinet on purity.racinet.fr <georges@racinet.fr>
date Wed, 14 Jul 2010 19:49:20 +0200
parents 4b03e0702b05
children 30f6d0876296
files RSSChannel.py tests/testRSS.py
diffstat 2 files changed, 98 insertions(+), 50 deletions(-) [+]
line diff
     1.1 --- a/RSSChannel.py
     1.2 +++ b/RSSChannel.py
     1.3 @@ -30,6 +30,7 @@
     1.4  import socket
     1.5  
     1.6  from AccessControl import ClassSecurityInfo
     1.7 +from AccessControl.SecurityManagement import getSecurityManager
     1.8  from Globals import InitializeClass
     1.9  
    1.10  
    1.11 @@ -120,6 +121,8 @@
    1.12           'label': 'Maximum number of items'},
    1.13          {'id': 'html_feed', 'type': 'boolean', 'mode': 'w',
    1.14           'label': 'HTML feeds are provided untransformed'},
    1.15 +        {'id': 'max_stale', 'type': 'int', 'mode': 'w',
    1.16 +         'label': 'Maximum time to keep stale data (seconds)'}
    1.17      )
    1.18  
    1.19      # Filled by a refresh
    1.20 @@ -136,11 +139,7 @@
    1.21      # in which case we provide it "as is" to the box
    1.22      html_feed = 0
    1.23  
    1.24 -    # Remember last time we retrieved a feed so that we can manually
    1.25 -    # tell feedparser to go find it again or not (trying to correct
    1.26 -    # weird behaviour)
    1.27 -    _etag = None
    1.28 -    _modified = None
    1.29 +    max_stale = 0 # means 10x refresh_delay
    1.30  
    1.31      def __init__(self, id, channel_url='', channel_proxy='',
    1.32                   new_window=0, nbMaxItems=0, html_feed=0):
    1.33 @@ -150,9 +149,17 @@
    1.34          self.new_window = new_window
    1.35          self.nbMaxItems = nbMaxItems
    1.36          self.html_feed = html_feed
    1.37 -        self._refresh_time = 0 # far in the past
    1.38 +        # user-dependent caches (user_id -> data)
    1.39 +        # in case this channel is user-independent, the None key
    1.40 +        # is used for code homogeneity
    1.41          self._data = {}
    1.42  
    1.43 +        self._refresh_time = {} # (user_id -> time), same convention
     1.44 +        # user_id (or None) -> dict for etag and modified
    1.45 +        # GR this is kept separate from _data in order to minimize code
    1.46 +        # to be updated and regression risks.
    1.47 +        self.http_caching_headers = {}
    1.48 +
    1.49      #
    1.50      # API
    1.51      #
    1.52 @@ -170,8 +177,14 @@
    1.53      def getData(self, maxItems=None):
    1.54          """Get the data for this channel, as a dict."""
    1.55  
    1.56 -        self._maybeRefresh()
    1.57 -        data = self._data.copy()
    1.58 +        if self.forward_auth:
    1.59 +            user_id = getSecurityManager().getUser().getId()
    1.60 +        else:
    1.61 +            user_id = None
    1.62 +
    1.63 +        self._maybeRefresh(user_id=user_id)
    1.64 +        data = self._data[user_id].copy()
    1.65 +
    1.66          if not self.html_feed:
    1.67              lines = data.get('lines', [])
    1.68              maxItems = maxItems or self.nbMaxItems
    1.69 @@ -184,37 +197,62 @@
    1.70      #
    1.71      # internal
    1.72      #
    1.73 -    def _maybeRefresh(self):
    1.74 +    def _maybeRefresh(self, user_id=None):
    1.75          """Refresh if on lazy refresh and the delay has elapsed."""
    1.76  
     1.77 +        # cleaning up all stale caches save this one in order to ease memory
     1.78 +        # being stale is a stronger condition than merely needing a refresh
     1.79 +        # (we have last_modified, etag and such for the latter).
    1.80 +        for u, t in self._refresh_time.items():
    1.81 +            if u == user_id:
    1.82 +                continue
    1.83 +            if now - t > self.max_stale:
    1.84 +                del self._refresh_time[u]
    1.85 +                self._data.pop(u, None) # pop for robustness
    1.86 +
    1.87 +                logger.debug(
    1.88 +                    "Removed stale cache (now=%s latest=%s) for user '%s'",
    1.89 +                     now, t, user_id)
    1.90 +
    1.91          if not self.lazy_refresh: # acquired from parent (portal_rss)
    1.92 -            LOG('RSSChannel refresh', DEBUG, 'not on lazy refresh')
    1.93 -            self._refresh()
    1.94 +            logger.debug('_maybeRefresh: not on lazy refresh')
    1.95 +            self._refresh(user_id=user_id)
    1.96              return
    1.97          delay = self.refresh_delay # acquired from parent (portal_rss)
    1.98          now = int(time.time())
    1.99 -        if now - self._refresh_time > delay:
   1.100 -            LOG('RSSChannel refresh', DEBUG, ' refreshing')
   1.101 -            self._refresh()
   1.102 +
   1.103 +        latest = self._refresh_time.get(user_id, 0)
   1.104 +        if user_id is None:
   1.105 +            logger.debug("_maybeRefresh: checking %s cache ", self)
   1.106 +        logger.debug("_maybeRefresh: checking %s cache for user '%s'",
   1.107 +                     self, user_id)
   1.108 +
   1.109 +        if now - latest > delay:
   1.110 +            logger.debug('_maybeRefresh: refreshing %s', self)
   1.111 +            self._refresh(user_id=user_id)
   1.112          else:
   1.113 -            LOG('RSSChannel refresh', DEBUG, 'not refreshing (now=%s last=%s)' %
   1.114 -                (now, self._refresh_time))
   1.115 +            logger.debug('_maybeRefresh: not refreshing %s (now=%s latest=%s)',
   1.116 +                         self, now, latest)
   1.117  
   1.118 -    def _refresh(self):
   1.119 +    def _refresh(self, user_id=None):
   1.120          """Refresh the channels from its source."""
   1.121  
   1.122          if self.html_feed:
   1.123 -            self._retrieveHTMLFeed()
   1.124 +            self._retrieveHTMLFeed(user_id=user_id)
   1.125          else:
   1.126 -            self._retrieveRSSFeed()
   1.127 -        self._refresh_time = int(time.time())
   1.128 +            self._retrieveRSSFeed(user_id=user_id)
   1.129  
   1.130 -    def _retrieveRSSFeed(self):
   1.131 +        now = int(time.time())
   1.132 +        self._refresh_time[user_id] = now
   1.133 +
   1.134 +    def _retrieveRSSFeed(self, user_id=None):
   1.135          """Call URP which will fetch and parse the RSS/XML feed"""
   1.136  
   1.137          url = self.channel_url
   1.138          if not url.startswith('http://') or url.startswith('https://'):
   1.139              data = {'channel': {}, 'items': []}
   1.140 +
   1.141 +        existing_data = self._data.setdefault(user_id, {})
   1.142          handlers = []
   1.143          try :
   1.144              if self.channel_proxy:
   1.145 @@ -228,8 +266,11 @@
   1.146                                  self.getId(), self.title)
   1.147                  else:
   1.148                      handlers = [atool.getForwarderHandler()]
   1.149 -            if self._data.get('items'):
   1.150 -                data = feedparser.parse(url, self._etag, self._modified, handlers=handlers)
   1.151 +            if existing_data.get('items'):
   1.152 +                headers = self.http_caching_headers.get(user_id, {})
   1.153 +                data = feedparser.parse(url, headers.get('etag'),
   1.154 +                                        headers.get('modified'),
   1.155 +                                        handlers=handlers)
   1.156              else:
   1.157                  data = feedparser.parse(url, None, None, handlers=handlers)
   1.158          except SGMLParseError, err:
   1.159 @@ -251,7 +292,7 @@
   1.160              # more info about why the request failed, like the error code (404,
   1.161              # etc.) but this might be overly complex/geeky in the general
   1.162              # context
   1.163 -            self._data = {'title': "Broken RSS Channel",
   1.164 +            self._data[user_id] = {'title': "Broken RSS Channel",
   1.165                            'description': "URL " + url + " cannot be accessed.",
   1.166                            'url': url,
   1.167                            'lines': [],
   1.168 @@ -302,40 +343,44 @@
   1.169                          filteredData['description'] = chn['description']
   1.170                      if chn.has_key('link'):
   1.171                          filteredData['url'] = chn['link']
   1.172 -                if data.has_key('etag'):
   1.173 -                    self._etag = data['etag']
   1.174 -                if data.has_key('modified'):
   1.175 -                    self._modified = data['modified']
   1.176 +
   1.177 +                self._store_headers(user_id, data)
   1.178                  self.title = filteredData['title']
   1.179                  if self.title is None or len(self.title) == 0 \
   1.180                    or self.title.isspace():
   1.181                      self.title = self.id
   1.182                  self.description = filteredData['description']
   1.183                  # Assign data to object.
   1.184 -                if self._data != filteredData:
   1.185 -                    self._data = filteredData
   1.186 +                if existing_data != filteredData:
   1.187 +                    self._data[user_id] = filteredData
   1.188              else:
   1.189 -                if not self._data.get('title', '').strip():
   1.190 -                    self._data['title'] = self.id
   1.191 -                if not self._data.has_key('description'):
   1.192 -                    self._data['description'] = ''
   1.193 -                if not self._data.get('url', '').strip():
   1.194 -                    self._data['url'] = url
   1.195 -                if not self._data.has_key('lines'):
   1.196 -                    self._data['lines'] = []
   1.197 -                if not self._data.has_key('newWindow'):
   1.198 -                    self._data['newWindow'] = self.new_window
   1.199 -                if not self._data.has_key('feedType'):
   1.200 -                    self._data['feedType'] = 0
   1.201 -                self.title = self._data['title']
   1.202 +                # we simply update the existing data
   1.203 +                if not existing_data.get('title', '').strip():
   1.204 +                    existing_data['title'] = self.id
   1.205 +                if not existing_data.has_key('description'):
   1.206 +                    existing_data['description'] = ''
   1.207 +                if not existing_data.get('url', '').strip():
   1.208 +                    existing_data['url'] = url
   1.209 +                if not existing_data.has_key('lines'):
   1.210 +                    existing_data['lines'] = []
   1.211 +                if not existing_data.has_key('newWindow'):
   1.212 +                    existing_data['newWindow'] = self.new_window
   1.213 +                if not existing_data.has_key('feedType'):
   1.214 +                    existing_data['feedType'] = 0
   1.215 +                self.title = existing_data['title']
   1.216                  if self.title is None or len(self.title) == 0 \
   1.217                    or self.title.isspace():
   1.218                      self.title = self.id
   1.219 -                self.description = self._data['description']
   1.220 -                if data.has_key('etag'):
   1.221 -                    self._etag = data['etag']
   1.222 -                if data.has_key('modified'):
   1.223 -                    self._modified = data['modified']
   1.224 +                self.description = existing_data['description']
   1.225 +                self._store_headers(user_id, data)
   1.226 +
   1.227 +    def _store_headers(self, user_id, data):
    1.228 +        """Extract HTTP caching-related headers from data and store them."""
   1.229 +        headers = self.http_caching_headers.setdefault(user_id, {})
   1.230 +        if data.has_key('etag'):
   1.231 +            headers['etag'] = data['etag']
   1.232 +        if data.has_key('modified'):
   1.233 +            headers['modified'] = data['modified']
   1.234  
   1.235      def _retrieveHTMLFeed(self):
   1.236          """Fetch an HTML feed"""
   1.237 @@ -354,9 +399,9 @@
   1.238              html_data = ''
   1.239              self.description = "An error occured while retrieving this feed"
   1.240          data = {'feedType': 1, 'htmlData': html_data}
   1.241 -        if self._data != data:
   1.242 +        if self._data.get(user_id) != data:
   1.243              # Avoid modifying persistent object if nothing has changed.
   1.244 -            self._data = data
   1.245 +            self._data[user_id] = data
   1.246  
   1.247      #
   1.248      # ZMI
     2.1 --- a/tests/testRSS.py
     2.2 +++ b/tests/testRSS.py
     2.3 @@ -48,6 +48,9 @@
     2.4      def testChannelNotLazy(self):
     2.5          self._testChannel(lazy_refresh=0)
     2.6  
     2.7 +    def testPerUserCache(self):
     2.8 +        pass
     2.9 +
    2.10      def testExcHandling(self):
    2.11          rss_tool = self.portal.portal_rss
    2.12          rss_tool.lazy_refresh = False