view publish_edition.py @ 14:4a2499602804

refactorings to publish_edition.py
author Atul Varma <varmaa@toolness.com>
date Sun, 03 Jan 2010 00:27:38 -0800
parents 69fd13a4aef4
children f0dd39b7cbe1

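"""Publish a daily-edition JSON file of recent articles by followed authors.

Feeds are discovered through a cached whoisi lookup, fetched into a URL
cache, parsed with feedparser, and filtered to the last few days before
being written to daily-edition.json.
"""
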
import os
import logging
import traceback
import cPickle as pickle
from datetime import date, datetime, timedelta

import feedparser
from whoisi_cache import WhoisiServer, WhoisiCache, json
from url_cache import UrlCache

AUTHORS_FILENAME = 'authors.txt'
URLS_FILENAME = 'ucache.pickle'
WHOISI_FILENAME = 'wicache.pickle'
ARTICLES_FILENAME = 'articles.pickle'
JSON_FILENAME = 'daily-edition.json'

def load(filename, default):
    """Unpickle and return filename's contents, or default if it doesn't exist."""
    if os.path.exists(filename):
        return pickle.load(open(filename, 'rb'))
    return default

def save(obj, filename):
    """Pickle obj to filename."""
    f = open(filename, 'wb')
    pickle.dump(obj, f)
    f.close()

def to_date_tuple(dt):
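    """Return dt as a JSON-serializable (year, month, day) tuple."""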
    return (dt.year, dt.month, dt.day)

def publish_edition(update_whoisi=False,
                    update_urls=False,
                    update_articles=False):
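    """Build daily-edition.json from the feeds of the authors in authors.txt.

    The update_* flags control whether the whoisi cache, the URL cache, and
    the parsed-article cache are refreshed before the edition is written.
    """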
    names = [line.strip()
             for line in open(AUTHORS_FILENAME, 'r')
             if line.strip() and not line.startswith('#')]

    urls = load(URLS_FILENAME, {})
    ucache = UrlCache(urls)

    people = load(WHOISI_FILENAME, [])
    wiserver = WhoisiServer()
    wicache = WhoisiCache(wiserver, people)

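    # Refresh the cached whoisi people data and persist it.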
    if update_whoisi:
        wicache.update()
        save(people, WHOISI_FILENAME)

    #wicache.refresh_people([people.index(person)])

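    # Limit publication to the people named in authors.txt.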
    following = [person for person in people
                 if person['name'] in names]

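    # Map each followed author to the feed URLs listed on their sites.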
    feeds = {}

    for person in following:
        person_feeds = []
        for site in person['sites'].values():
            if site['type'] == 'feed':
                person_feeds.append(site['feed'])
        feeds[person['name']] = person_feeds

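    # Re-fetch the raw contents of every feed URL into the URL cache.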
    if update_urls:
        for feed_urls in feeds.values():
            for url in feed_urls:
                try:
                    ucache.refresh(url)
                except Exception:
                    # Log the failure but keep refreshing the other feeds.
                    traceback.print_exc()
        save(urls, URLS_FILENAME)

    articles = load(ARTICLES_FILENAME, {})

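    # Re-parse the cached feeds into per-author lists of articles.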
    if update_articles:
        for author, feed_urls in feeds.items():
            articles[author] = []
            for url in feed_urls:
                feed = feedparser.parse(urls[url]['data'])
                for entry in feed['entries']:
                    updated = entry.get('updated_parsed')
                    if updated is None:
                        # Skip entries without a parseable date.
                        continue
                    updated = date(year=updated.tm_year,
                                   month=updated.tm_mon,
                                   day=updated.tm_mday)
                    content = entry.get('content', '')
                    summary = entry.get('summary', '')
                    summary_detail = entry.get('summary_detail', {})
                    if not content:
                        # Fall back to the summary detail, then to a
                        # plain-text summary, when full content is missing.
                        if summary_detail.get('value'):
                            content = [summary_detail]
                        elif summary:
                            content = [{'type': 'text/plain',
                                        'value': summary}]
                    if content:
                        article = {'url': entry.get('link'),
                                   'title': entry.get('title'),
                                   'pub_date': updated,
                                   'content': content}
                        articles[author].append(article)
        save(articles, ARTICLES_FILENAME)

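    # Publish only articles dated strictly between three days ago and today.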
    max_date = date.today()
    min_date = max_date - timedelta(days=3)

    published_authors = [author for author in names
                         if author in articles]

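    # Convert each author's recent articles into JSON-serializable records.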
    filtered_articles = {}

    for author in published_authors:
        filtered_articles[author] = [
            {'url': article['url'],
             'title': article['title'],
             'content': article['content'],
             'pubDate': to_date_tuple(article['pub_date'])}
            for article in articles[author]
            if min_date < article['pub_date'] < max_date
            ]

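    # Write the finished edition to daily-edition.json.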
    json.dump({'authors': names, 'articles': filtered_articles},
              open(JSON_FILENAME, 'w'))

    logging.info('wrote %s.', JSON_FILENAME)

if __name__ == '__main__':
    logging.basicConfig(level=logging.DEBUG)
    publish_edition()