view bzapi.py @ 28:ce19838a318d

we now split large batches of bugs into small segments of 10 each
author Atul Varma <varmaa@toolness.com>
date Thu, 24 Dec 2009 11:54:11 -0800
parents f717ecd3ede1
children 9a7052db1045
line wrap: on
line source

import logging
import urllib2
import urllib
from datetime import datetime

import pymongo
import simplejson as json

def split_seq(seq, size):
    """Return consecutive slices of seq, each holding at most size items.

    The final slice is shorter when len(seq) is not a multiple of size.
    An empty seq yields an empty list.
    """

    # Taken from http://code.activestate.com/recipes/425044/
    pieces = []
    for start in range(0, len(seq), size):
        pieces.append(seq[start:start + size])
    return pieces

def open_url(url, headers, query_args=None, urllib2=urllib2):
    """Issue a request for url and return the opened response.

    url        -- address to fetch, without a query string.
    headers    -- mapping of HTTP header names to values to send.
    query_args -- optional query parameters; when truthy they are
                  URL-encoded and appended to url after a '?'.
    urllib2    -- module-like object providing Request() and urlopen();
                  overridable so callers can substitute their own.

    Returns whatever urllib2.urlopen() returns.
    """
    full_url = url
    if query_args:
        encoded_args = urllib.urlencode(query_args)
        full_url = "%s?%s" % (url, encoded_args)

    # NOTE: the complete URL, query string included, goes to the debug log.
    logging.debug('retrieving %s' % full_url)

    request = urllib2.Request(full_url)
    for header_name, header_value in headers.items():
        request.add_header(header_name, header_value)
    return urllib2.urlopen(request)

def normalize_bug(bug):
    """Prepare a bug dict, in place, for storage.

    The 'last_change_time' and 'creation_time' values are converted from
    ISO timestamp strings to datetime objects, and the bug's 'id' is
    copied to '_id'.  Nothing is returned.
    """
    bug['last_change_time'] = datetime_from_iso(bug['last_change_time'])
    bug['creation_time'] = datetime_from_iso(bug['creation_time'])
    bug['_id'] = bug['id']

def datetime_from_rfc1123(timestamp):
    """Parse a timestamp such as 'Thu, 24 Dec 2009 11:54:11 GMT'.

    Returns a naive datetime; the literal 'GMT' suffix is required.
    """
    rfc1123_format = '%a, %d %b %Y %H:%M:%S GMT'
    return datetime.strptime(timestamp, rfc1123_format)

def datetime_to_iso(dt):
    """Format dt as an ISO timestamp string with a trailing 'Z'.

    Microseconds are dropped, e.g. '2009-12-24T11:54:11Z'.
    """
    whole_seconds = dt.replace(microsecond=0)
    return whole_seconds.isoformat('T') + "Z"

def datetime_from_iso(timestamp):
    """Parse a timestamp such as '2009-12-24T11:54:11Z'.

    Returns a naive datetime; the trailing 'Z' is required.
    """
    iso_format = '%Y-%m-%dT%H:%M:%SZ'
    return datetime.strptime(timestamp, iso_format)

def sanitize(obj):
    """Recursively rename, in place, every dict key that contains a '.'.

    Each '.' in a key is replaced with '_DOT_'.  Dicts and lists
    (including subclasses of either) are walked recursively; any other
    value is left untouched.  Nothing is returned.

    NOTE(review): presumably needed because the result is stored in
    MongoDB, which restricts '.' in key names -- confirm.
    """
    if isinstance(obj, dict):
        # Collect the offending keys first so the dict is not resized
        # while it is being iterated.
        dotted_names = [key for key in obj if "." in key]
        for name in dotted_names:
            obj[name.replace('.', '_DOT_')] = obj.pop(name)
        for value in obj.values():
            sanitize(value)
    elif isinstance(obj, list):
        for item in obj:
            sanitize(item)

class CachedSearch(object):
    """Keeps a collection of bugs in sync with the results of a search.

    update() runs the search through the API, flags bugs that are new or
    have changed, and then re-fetches those bugs in full (with comments
    and history) in batches of at most MAX_BUG_BATCH_SIZE.  Observers
    are notified once per re-fetched bug.
    """

    # Upper bound on the number of bug ids sent in one full-bug request.
    MAX_BUG_BATCH_SIZE = 10

    def __init__(self, api, collection, **kwargs):
        """Set up the search and load the last-update time.

        api        -- object whose get(url, **params) returns a dict with
                      'data' and 'date' entries.
        collection -- bug store providing find(), find_one() and save().
        kwargs     -- search parameters passed to the API on every update.
        """
        self.observers = []
        self.options = kwargs

        self.bugs = collection
        self.api = api
        self._update_last_update()

    def add_observer(self, observer):
        """Register an object whose notify() is called for updated bugs."""
        self.observers.append(observer)

    def _update_last_update(self):
        """Set last_update to the oldest 'retrieved_time' in the collection.

        last_update becomes None when the collection holds no bugs.
        """
        bugs = self.bugs.find().sort("retrieved_time",
                                     pymongo.ASCENDING).limit(1)
        if bugs.count() == 0:
            self.last_update = None
        else:
            self.last_update = bugs[0]['retrieved_time']

    def _retrieve_full_bugs(self, bug_ids):
        """Fetch the given bugs in full, store them and notify observers.

        bug_ids -- sequence of bug ids; each is converted to a string so
                   that integer ids are accepted as well as string ids.
        """
        params = {'id': ','.join(str(bug_id) for bug_id in bug_ids),
                  'id_mode': 'include',
                  'comments': '1',
                  'history': '1'}
        response = self.api.get('/bug', **params)
        for bug in response['data']['bugs']:
            logging.debug('updating bug %s', bug['id'])
            normalize_bug(bug)
            bug['retrieved_time'] = response['date']
            bug['needs_full_update'] = False
            self.bugs.save(bug)
            for observer in self.observers:
                observer.notify({'bug': bug['id']})

    def update(self):
        """Run the search and bring the collection up to date.

        When a last-update time is known, only bugs changed after it are
        requested.  Every returned bug gets a fresh 'retrieved_time';
        bugs that are new or newer than the stored copy are flagged and
        then re-fetched in full.
        """
        params = {}
        params.update(self.options)
        if self.last_update:
            params['changed_after'] = self.last_update
        response = self.api.get('/bug', **params)
        for bug in response['data']['bugs']:
            normalize_bug(bug)
            old_bug = self.bugs.find_one({'id': bug['id']})
            if old_bug is None:
                # Never seen before: store the search result itself.
                old_bug = bug
                old_bug['needs_full_update'] = True
            elif bug['last_change_time'] > old_bug['last_change_time']:
                old_bug['needs_full_update'] = True
            old_bug['retrieved_time'] = response['date']
            self.bugs.save(old_bug)
        bugs_to_update = self.bugs.find({'needs_full_update': True})
        bug_ids = [bug['id'] for bug in bugs_to_update]
        if bug_ids:
            for segment in split_seq(bug_ids, self.MAX_BUG_BATCH_SIZE):
                self._retrieve_full_bugs(segment)
            # last_update is only refreshed when something was re-fetched.
            self._update_last_update()

class BugzillaApi(object):
    """Minimal JSON client for a Bugzilla API rooted at base_url.

    The server's configuration is cached in a collection: it is fetched
    from '/configuration' only when the collection is empty.
    """

    def __init__(self, base_url, collection, username=None, password=None,
                 open_url=open_url):
        """Create the client and load (or fetch and store) the configuration.

        base_url   -- prefix prepended to every path passed to get().
        collection -- store providing find_one() and insert(), used to
                      cache the configuration.
        username, password -- optional credentials; sent only when both
                      are set.
        open_url   -- callable used to perform requests; overridable so
                      callers can substitute their own.
        """
        self._open_url = open_url

        self.base_url = base_url
        self.username = username
        self.password = password
        config = collection.find_one()
        if not config:
            config = self.get('/configuration')['data']
            # Rewrite dotted keys before the configuration is stored.
            sanitize(config)
            collection.insert(config)
        self.config = config

    def _validate_component(self, product, component=None):
        """Check product (and optionally component) against the configuration.

        Raises ValueError when the product is unknown, or when a
        component is given that is not listed for that product.
        """
        products = self.config['product']
        if product not in products:
            msg = 'product %s not in configuration' % repr(product)
            raise ValueError(msg)
        if component and component not in products[product]['component']:
            msg = 'component %s of product %s not in configuration' % (
                repr(component),
                repr(product)
                )
            raise ValueError(msg)

    def get(self, url, **kwargs):
        """GET base_url + url with kwargs as query parameters.

        datetime values in kwargs are sent as ISO timestamp strings.
        Credentials are added when both username and password are set;
        kwargs take precedence over them.  When a 'product' parameter is
        present it is validated against the cached configuration first.

        Returns a dict with the decoded JSON body under 'data' and, under
        'date', the UTC time taken just before the request was issued.

        Raises ValueError for an unknown product or component.
        """
        # Sampled before the request so the timestamp never post-dates
        # the server's view of the data.
        now = datetime.utcnow().replace(microsecond=0)

        params = {}
        if self.username and self.password:
            params.update({'username': self.username,
                           'password': self.password})
        # Build a fresh dict rather than rewriting kwargs while looping
        # over it.
        for name, value in kwargs.items():
            if isinstance(value, datetime):
                value = datetime_to_iso(value)
            params[name] = value

        if 'product' in params:
            self._validate_component(params['product'],
                                     params.get('component'))

        response = self._open_url(
            url=self.base_url + url,
            query_args=params,
            headers={'Accept': 'application/json',
                     'Content-Type': 'application/json'},
            )

        # Always release the connection, even if reading fails.  The
        # close() lookup is guarded because open_url is replaceable and
        # a substitute's response object may not provide it.
        try:
            body = response.read()
        finally:
            close = getattr(response, 'close', None)
            if close is not None:
                close()

        # TODO: instead of 'now', we'd like to use the 'Date'
        # HTTP header, but it's actually completely wrong in
        # the case of bugzilla.mozilla.org, so we'll assume
        # our timekeeping is better.

        #'date': datetime_from_rfc1123(response.info()['Date'])}

        return {'data': json.loads(body),
                'date': now}