view publish_edition.py @ 41:faf4b49c5e6e

fixed a major, glaring bug.
author Atul Varma <varmaa@toolness.com>
date Wed, 20 Jan 2010 07:46:51 -0800
parents 192bf3100269
children d9a6d70d9711

import os
import logging
import traceback
import optparse
import cPickle as pickle
from datetime import date, datetime, timedelta

import feedparser
from whoisi_cache import WhoisiServer, WhoisiCache, json
from url_cache import UrlCache

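# Input file, on-disk caches, and output file used by the publisher.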
AUTHORS_FILENAME = 'authors.txt'
URLS_FILENAME = 'url_cache.dat'
WHOISI_FILENAME = 'whoisi_cache.dat'
ARTICLES_FILENAME = 'articles.dat'
ISSUES_FILENAME = 'issues.dat'
JSON_FILENAME = 'daily-edition.json'

def load(filename, default):
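    """Return the unpickled contents of filename, or default if the
    file does not exist."""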
    if os.path.exists(filename):
        return pickle.load(open(filename, 'rb'))
    return default

def save(obj, filename):
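    """Pickle obj to filename."""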
    f = open(filename, 'wb')
    pickle.dump(obj, f)
    f.close()

def to_date_tuple(dt):
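    """Convert a date or datetime into a (year, month, day) tuple
    suitable for JSON serialization."""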
    return (dt.year, dt.month, dt.day)

def refresh_urls(feeds, urls):
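    """Refresh every feed URL in the URL cache, logging (but otherwise
    ignoring) errors so that one bad feed cannot abort the whole run."""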
    ucache = UrlCache(urls)
    for feed_urls in feeds.values():
        for url in feed_urls:
            try:
                ucache.refresh(url)
            except Exception:
                traceback.print_exc()

def refresh_articles(articles, feeds, urls):
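    """Re-parse each author's cached feed data and rebuild their list
    of articles, recording each entry's link, title, publication date
    and best available content."""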
    for author, feed_urls in feeds.items():
        articles[author] = []
        for url in feed_urls:
            logging.debug('parsing feed at %s.' % url)
            feed = feedparser.parse(urls[url]['data'])
            for entry in feed['entries']:
                updated = entry.get('updated_parsed')
                if updated is None:
                    # Entries without a parseable date can't be aged,
                    # so skip them rather than crash.
                    continue
                updated = date(year=updated.tm_year,
                               month=updated.tm_mon,
                               day=updated.tm_mday)
                # Prefer full entry content, then the summary detail,
                # then a plain-text summary.
                content = entry.get('content', '')
                summary = entry.get('summary', '')
                summary_detail = entry.get('summary_detail', {})
                if not content:
                    if summary_detail and summary_detail.get('value'):
                        content = [summary_detail]
                    elif summary:
                        content = [{'type': 'text/plain',
                                    'value': summary}]
                if content:
                    article = {'url': entry.get('link'),
                               'title': entry.get('title'),
                               'pub_date': updated,
                               'content': content}
                    articles[author].append(article)

def normalize(obj):
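    """Recursively decode any byte strings in obj to UTF-8 unicode,
    dropping undecodable bytes, so the structure can be serialized as
    JSON."""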
    if isinstance(obj, list):
        for i in range(len(obj)):
            obj[i] = normalize(obj[i])
    elif isinstance(obj, dict):
        for key in obj:
            obj[key] = normalize(obj[key])
    elif isinstance(obj, str):
        try:
            obj = obj.decode('utf-8')
        except UnicodeDecodeError, e:
            logging.warn('error decoding "%s" (%s).' % (repr(obj), e))
            obj = obj.decode('utf-8', 'ignore')
    return obj

def filter_articles(names, articles, issues,
                    max_articles_per_author=1,
                    min_article_word_count=50,
                    max_article_age=timedelta(days=15),
                    max_issue_word_count=2500):
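    """Select the articles for a new issue: recent articles that have
    not appeared in a previous issue and have usable HTML content,
    taking at most max_articles_per_author per author and keeping the
    issue's total length under max_issue_word_count words."""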
    min_date = date.today() - max_article_age

    published_authors = [author for author in names
                         if author in articles]

    filtered_articles = {}
    words_left = max_issue_word_count

    total_potentials = 0
    total_articles = 0
    total_word_count = 0

    for author in published_authors:
        articles_left = max_articles_per_author
        potential_articles = [
            {'url': article['url'],
             'title': article['title'],
             'content': article['content'],
             'pubDate': to_date_tuple(article['pub_date'])}
            for article in articles[author]
            if (article['pub_date'] > min_date
                and article['url'] not in issues['urls']
                and article['url'].startswith('http'))
            ]

        total_potentials += len(potential_articles)

        for article in potential_articles:
            html = [ctype['value']
                    for ctype in article['content']
                    if ctype.get('type') == 'text/html'
                    and ctype.get('value')]
            if not html:
                logging.warn('no html content for %s.' % article['url'])
            elif len(html) > 1:
                logging.warn('multiple html found for %s.' % article['url'])
            else:
                word_count = len(html[0].split())
                if (word_count > min_article_word_count and
                    word_count < words_left):
                    if author not in filtered_articles:
                        filtered_articles[author] = []
                    total_word_count += word_count
                    total_articles += 1
                    filtered_articles[author].append(article)
                    words_left -= word_count
                    articles_left -= 1
                    if not articles_left:
                        break
                elif word_count > max_issue_word_count:
                    logging.warn(
                        'article will never be included in an '
                        'issue due to word count: %s (%d words)' % (
                            article['url'],
                            word_count
                            ))

    logging.debug('found %d articles (out of a potential %d), totalling '
                  '%d words, with contributions by %s.' %
                  (total_articles, total_potentials, total_word_count,
                   ', '.join(filtered_articles.keys())))
    return normalize(filtered_articles)

def publish_edition(update_whoisi=False,
                    update_urls=False,
                    update_articles=False,
                    dry_run=False):
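    """Assemble and publish a new edition: optionally refresh the
    whoisi, URL and article caches, select articles for the issue,
    record them in the issue history, and write the edition to
    JSON_FILENAME. With dry_run, nothing is written to disk."""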
    if update_whoisi:
        update_urls = True
    if update_urls:
        update_articles = True

    names = [line.strip()
             for line in open(AUTHORS_FILENAME, 'r')
             if line.strip() and not line.startswith('#')]

    people = load(WHOISI_FILENAME, [])
    wiserver = WhoisiServer()
    wicache = WhoisiCache(wiserver, people)

    if update_whoisi:
        wicache.update()

    whoisi_names = [person['name'] for person in people]
    unknown_names = [name for name in names
                     if name not in whoisi_names]
    if unknown_names:
        logging.warn('could not find information on: %s.' % 
                     ', '.join(unknown_names))

    following = [person for person in people
                 if person['name'] in names]

    if update_whoisi:
        people_indexes = [people.index(person)
                          for person in following]
        wicache.refresh_people(people_indexes)
        if not dry_run:
            save(people, WHOISI_FILENAME)

    feeds = {}

    for person in following:
        person_feeds = []
        for site in person['sites'].values():
            if site['type'] == 'feed':
                person_feeds.append(site['feed'])
        feeds[person['name']] = person_feeds

    urls = load(URLS_FILENAME, {})

    if update_urls:
        refresh_urls(feeds=feeds, urls=urls)
        if not dry_run:
            save(urls, URLS_FILENAME)

    articles = load(ARTICLES_FILENAME, {})

    if update_articles:
        refresh_articles(articles=articles,
                         feeds=feeds,
                         urls=urls)
        if not dry_run:
            save(articles, ARTICLES_FILENAME)

    issues = load(ISSUES_FILENAME, {'urls': {},
                                    'pub_dates': []})

    filtered_articles = filter_articles(names=names,
                                        articles=articles,
                                        issues=issues)

    issue_id = len(issues['pub_dates'])
    issues['pub_dates'].append(datetime.now())
    for author in filtered_articles:
        for article in filtered_articles[author]:
            issues['urls'][article['url']] = issue_id
    if not dry_run:
        save(issues, ISSUES_FILENAME)

    if not dry_run:
        json.dump({'id': issue_id,
                   'authors': names,
                   'articles': filtered_articles,
                   'pubDate': to_date_tuple(date.today())},
                  open(JSON_FILENAME, 'w'))

        logging.info('wrote %s (issue #%d).' % (JSON_FILENAME, issue_id+1))

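# Command-line flags, mapped onto the keyword arguments of
# publish_edition().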
parser_options = {
    ('-w', '--refresh-whoisi',): 
    dict(dest='update_whoisi',
         help='re-sync with whoisi.com',
         action='store_true',
         default=False),

    ('-f', '--refresh-feeds',): 
    dict(dest='update_urls',
         help='refresh feeds',
         action='store_true',
         default=False),

    ('-p', '--reparse-feeds',): 
    dict(dest='update_articles',
         help='re-parse feeds',
         action='store_true',
         default=False),

    ('-d', '--dry-run',): 
    dict(dest='dry_run',
         help='do not write anything to disk',
         action='store_true',
         default=False),
}

if __name__ == '__main__':
    logging.basicConfig(level=logging.DEBUG)

    parser = optparse.OptionParser()
    for names, opts in parser_options.items():
        parser.add_option(*names, **opts)
    (options, args) = parser.parse_args()

    publish_edition(**options.__dict__)