view publish_edition.py @ 21:6a6632954dc6

added logging statement while parsing feeds
author Atul Varma <varmaa@toolness.com>
date Sun, 03 Jan 2010 04:41:44 -0800
parents 5e7ecaf68b9a
children f2fafca30ff3

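# Builds the "daily edition": reads a list of author names from
# authors.txt, looks up their feeds via a cached whoisi.com index,
# fetches and parses the feeds, keeps articles from roughly the last
# three days, and writes the result to daily-edition.json.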
import os
import logging
import traceback
import optparse
import cPickle as pickle
from datetime import date, datetime, timedelta

import feedparser
from whoisi_cache import WhoisiServer, WhoisiCache, json
from url_cache import UrlCache

AUTHORS_FILENAME = 'authors.txt'
URLS_FILENAME = 'url_cache.dat'
WHOISI_FILENAME = 'whoisi_cache.dat'
ARTICLES_FILENAME = 'articles.dat'
JSON_FILENAME = 'daily-edition.json'

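# Unpickle an object from disk, or return `default` if the file is missing.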
def load(filename, default):
    if os.path.exists(filename):
        with open(filename, 'rb') as f:
            return pickle.load(f)
    return default

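# Pickle an object to disk, overwriting any existing file.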
def save(obj, filename):
    with open(filename, 'wb') as f:
        pickle.dump(obj, f)

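# Convert a date/datetime into a JSON-friendly (year, month, day) tuple.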
def to_date_tuple(dt):
    return (dt.year, dt.month, dt.day)

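# Re-fetch every feed URL through the shared URL cache, printing a
# traceback for any failure so one bad feed does not abort the run.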
def refresh_urls(feeds, urls):
    ucache = UrlCache(urls)
    for feed_urls in feeds.values():
        for url in feed_urls:
            try:
                ucache.refresh(url)
            except Exception:
                traceback.print_exc()

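# Parse each author's cached feed data with feedparser and rebuild their
# article list (url, title, pub_date, content). Entries without usable
# content are dropped.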
def refresh_articles(articles, feeds, urls):
    for author, feed_urls in feeds.items():
        articles[author] = []
        for url in feed_urls:
            logging.debug('parsing feed at %s.' % url)
            feed = feedparser.parse(urls[url]['data'])
            for entry in feed['entries']:
                updated = entry.get('updated_parsed')
                if updated is None:
                    # Skip entries that carry no parseable date.
                    continue
                updated = date(year=updated.tm_year,
                               month=updated.tm_mon,
                               day=updated.tm_mday)
                content = entry.get('content', '')
                summary = entry.get('summary', '')
                summary_detail = entry.get('summary_detail', {})
                if not content:
                    # Fall back to summary_detail, then to a plain-text summary.
                    if summary_detail and summary_detail.get('value'):
                        content = [summary_detail]
                    elif summary:
                        content = [{'type': 'text/plain',
                                    'value': summary}]
                if content:
                    article = {'url': entry.get('link'),
                               'title': entry.get('title'),
                               'pub_date': updated,
                               'content': content}
                    articles[author].append(article)

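# Keep only articles published strictly between three days ago and today,
# converting each pub_date to a (year, month, day) tuple for JSON output.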
def filter_articles(names, articles):
    max_date = date.today()
    min_date = max_date - timedelta(days=3)

    published_authors = [author for author in names
                         if author in articles]

    filtered_articles = {}

    for author in published_authors:
        filtered_articles[author] = [
            {'url': article['url'],
             'title': article['title'],
             'content': article['content'],
             'pubDate': to_date_tuple(article['pub_date'])}
            for article in articles[author]
            if (article['pub_date'] > min_date and
                article['pub_date'] < max_date)
            ]

    return filtered_articles

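# Main pipeline: load the caches, optionally refresh the whoisi index,
# the cached feed URLs, and the parsed articles, then write the filtered
# result to daily-edition.json. Refreshing whoisi implies refreshing the
# URLs, which in turn implies re-parsing articles.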
def publish_edition(update_whoisi=False,
                    update_urls=False,
                    update_articles=False):
    if update_whoisi:
        update_urls = True
    if update_urls:
        update_articles = True

    names = [line.strip()
             for line in open(AUTHORS_FILENAME, 'r').readlines()
             if line.strip() and not line.startswith('#')]

    people = load(WHOISI_FILENAME, [])
    wiserver = WhoisiServer()
    wicache = WhoisiCache(wiserver, people)

    if update_whoisi:
        wicache.update()

    following = [person for person in people
                 if person['name'] in names]

    if update_whoisi:
        people_indexes = [people.index(person)
                          for person in following]
        wicache.refresh_people(people_indexes)
        save(people, WHOISI_FILENAME)

    feeds = {}

    for person in following:
        person_feeds = []
        for site in person['sites'].values():
            if site['type'] == 'feed':
                person_feeds.append(site['feed'])
        feeds[person['name']] = person_feeds

    urls = load(URLS_FILENAME, {})

    if update_urls:
        refresh_urls(feeds=feeds, urls=urls)
        save(urls, URLS_FILENAME)

    articles = load(ARTICLES_FILENAME, {})

    if update_articles:
        refresh_articles(articles=articles,
                         feeds=feeds,
                         urls=urls)
        save(articles, ARTICLES_FILENAME)

    filtered_articles = filter_articles(names=names,
                                        articles=articles)

    json.dump({'authors': names, 'articles': filtered_articles},
              open(JSON_FILENAME, 'w'))

    logging.info('wrote %s.' % JSON_FILENAME)

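# Command-line flags, keyed by (short, long) option strings; each value
# holds the keyword arguments passed to optparse's add_option().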
parser_options = {
    ('-w', '--update-whoisi',): 
    dict(dest='update_whoisi',
         help='re-sync with whoisi.com',
         action='store_true',
         default=False),

    ('-u', '--update-urls',): 
    dict(dest='update_urls',
         help='refresh feeds',
         action='store_true',
         default=False),

    ('-a', '--update-articles',): 
    dict(dest='update_articles',
         help='re-parse articles',
         action='store_true',
         default=False),
}

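# Entry point: enable DEBUG logging, build the option parser from
# parser_options, and run publish_edition() with the parsed flags, e.g.:
#
#   python publish_edition.py --update-whoisi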
if __name__ == '__main__':
    logging.basicConfig(level=logging.DEBUG)

    parser = optparse.OptionParser()
    for names, opts in parser_options.items():
        parser.add_option(*names, **opts)
    (options, args) = parser.parse_args()

    publish_edition(**options.__dict__)