Mercurial > daily-edition
view publish_edition.py @ 28:6e2038000082
added dry-run cmdline option
| author   | Atul Varma <varmaa@toolness.com> |
| -------- | -------------------------------- |
| date     | Sun, 03 Jan 2010 09:20:29 -0800  |
| parents  | 83091b7b3e59                     |
| children | 6fbd38dd976a                     |
import os
import logging
import traceback
import optparse
import cPickle as pickle
from datetime import date, datetime, timedelta

import feedparser

from whoisi_cache import WhoisiServer, WhoisiCache, json
from url_cache import UrlCache

AUTHORS_FILENAME = 'authors.txt'
URLS_FILENAME = 'url_cache.dat'
WHOISI_FILENAME = 'whoisi_cache.dat'
ARTICLES_FILENAME = 'articles.dat'
ISSUES_FILENAME = 'issues.dat'
JSON_FILENAME = 'daily-edition.json'

def load(filename, default):
    if os.path.exists(filename):
        return pickle.load(open(filename, 'r'))
    return default

def save(obj, filename):
    f = open(filename, 'w')
    pickle.dump(obj, f)
    f.close()

def to_date_tuple(dt):
    return (dt.year, dt.month, dt.day)

def refresh_urls(feeds, urls):
    ucache = UrlCache(urls)
    for feed_urls in feeds.values():
        for url in feed_urls:
            try:
                ucache.refresh(url)
            except Exception, e:
                traceback.print_exc(e)

def refresh_articles(articles, feeds, urls):
    for author, feed_urls in feeds.items():
        articles[author] = []
        for url in feed_urls:
            logging.debug('parsing feed at %s.' % url)
            feed = feedparser.parse(urls[url]['data'])
            for entry in feed['entries']:
                updated = entry.get('updated_parsed')
                updated = date(year=updated.tm_year,
                               month=updated.tm_mon,
                               day=updated.tm_mday)
                content = entry.get('content', '')
                summary = entry.get('summary', '')
                summary_detail = entry.get('summary_detail', {})
                if not content:
                    if not (summary_detail and summary_detail.get('value')):
                        if not summary:
                            pass
                        else:
                            content = [{'type': 'text/plain',
                                        'value': summary}]
                    else:
                        content = [summary_detail]
                if content:
                    article = {'url': entry.get('link'),
                               'title': entry.get('title'),
                               'pub_date': updated,
                               'content': content}
                    articles[author].append(article)

def normalize(obj):
    if isinstance(obj, list):
        for i in range(len(obj)):
            obj[i] = normalize(obj[i])
    elif isinstance(obj, dict):
        for key in obj:
            obj[key] = normalize(obj[key])
    elif isinstance(obj, str):
        try:
            obj = obj.decode('utf-8')
        except UnicodeDecodeError, e:
            logging.warn('error decoding "%s" (%s).' % (repr(obj), e))
            obj = obj.decode('utf-8', 'ignore')
    return obj

def filter_articles(names, articles, issues,
                    max_articles_per_author=1,
                    max_word_count=2500,
                    max_article_age=timedelta(days=30)):
    min_date = date.today() - max_article_age
    published_authors = [author for author in names
                         if author in articles]
    filtered_articles = {}
    words_left = max_word_count
    for author in published_authors:
        articles_left = max_articles_per_author
        potential_articles = [
            {'url': article['url'],
             'title': article['title'],
             'content': article['content'],
             'pubDate': to_date_tuple(article['pub_date'])}
            for article in articles[author]
            if (article['pub_date'] > min_date and
                article['url'] not in issues['urls'])
            ]
        for article in potential_articles:
            html = [ctype['value'] for ctype in article['content']
                    if ctype.get('type') == 'text/html'
                    and ctype.get('value')]
            if not html:
                logging.warn('no html content for %s.' % article['url'])
            elif len(html) > 1:
                logging.warn('multiple html found for %s.'
                             % article['url'])
            else:
                word_count = len(html[0].split())
                if word_count < words_left:
                    if author not in filtered_articles:
                        filtered_articles[author] = []
                    filtered_articles[author].append(article)
                elif word_count > max_word_count:
                    logging.warn(
                        'article will never be included in an '
                        'issue due to word count: %s (%d words)' % (
                            article['url'],
                            word_count
                            ))
                words_left -= word_count
                articles_left -= 1
                if not articles_left:
                    break
    return normalize(filtered_articles)

def publish_edition(update_whoisi=False,
                    update_urls=False,
                    update_articles=False,
                    dry_run=False):
    if update_whoisi:
        update_urls = True

    if update_urls:
        update_articles = True

    names = [line.strip()
             for line in open(AUTHORS_FILENAME, 'r').readlines()
             if line and not line.startswith('#')]

    people = load(WHOISI_FILENAME, [])
    wiserver = WhoisiServer()
    wicache = WhoisiCache(wiserver, people)

    if update_whoisi:
        wicache.update()

    whoisi_names = [person['name'] for person in people]
    unknown_names = [name for name in names
                     if name not in whoisi_names]

    if unknown_names:
        logging.warn('could not find information on: %s.' %
                     ', '.join(unknown_names))

    following = [person for person in people
                 if person['name'] in names]

    if update_whoisi:
        people_indexes = [people.index(person) for person in following]
        wicache.refresh_people(people_indexes)
        if not dry_run:
            save(people, WHOISI_FILENAME)

    feeds = {}
    for person in following:
        person_feeds = []
        for site in person['sites'].values():
            if site['type'] == 'feed':
                person_feeds.append(site['feed'])
        feeds[person['name']] = person_feeds

    urls = load(URLS_FILENAME, {})

    if update_urls:
        refresh_urls(feeds=feeds, urls=urls)
        if not dry_run:
            save(urls, URLS_FILENAME)

    articles = load(ARTICLES_FILENAME, {})

    if update_articles:
        refresh_articles(articles=articles, feeds=feeds, urls=urls)
        if not dry_run:
            save(articles, ARTICLES_FILENAME)

    issues = load(ISSUES_FILENAME, {'urls': {}, 'pub_dates': []})

    filtered_articles = filter_articles(names=names,
                                        articles=articles,
                                        issues=issues)

    issue_id = len(issues['pub_dates'])
    issues['pub_dates'].append(datetime.now())
    for author in filtered_articles:
        for article in filtered_articles[author]:
            issues['urls'][article['url']] = issue_id
    if not dry_run:
        save(issues, ISSUES_FILENAME)

    if not dry_run:
        json.dump({'authors': names,
                   'articles': filtered_articles},
                  open(JSON_FILENAME, 'w'))
        logging.info('wrote %s (issue #%d).' % (JSON_FILENAME, issue_id))

parser_options = {
    ('-w', '--refresh-whoisi',): dict(dest='update_whoisi',
                                      help='re-sync with whoisi.com',
                                      action='store_true',
                                      default=False),
    ('-f', '--refresh-feeds',): dict(dest='update_urls',
                                     help='refresh feeds',
                                     action='store_true',
                                     default=False),
    ('-p', '--reparse-feeds',): dict(dest='update_articles',
                                     help='re-parse feeds',
                                     action='store_true',
                                     default=False),
    ('-d', '--dry-run',): dict(dest='dry_run',
                               help='do not write anything to disk',
                               action='store_true',
                               default=False),
    }

if __name__ == '__main__':
    logging.basicConfig(level=logging.DEBUG)
    parser = optparse.OptionParser()
    for names, opts in parser_options.items():
        parser.add_option(*names, **opts)
    (options, args) = parser.parse_args()
    publish_edition(**options.__dict__)
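A hedged usage sketch, not part of the repository: it shows one way the module could be driven programmatically, roughly equivalent to running `python publish_edition.py --refresh-whoisi --dry-run`. The module and function names come from the file above; everything else is illustrative.

# Hypothetical usage sketch -- not part of publish_edition.py itself.
# Roughly equivalent to: python publish_edition.py --refresh-whoisi --dry-run
import logging

from publish_edition import publish_edition

logging.basicConfig(level=logging.DEBUG)

# update_whoisi=True cascades inside publish_edition() to update_urls and
# update_articles; dry_run=True skips every save() call and the final
# json.dump(), so nothing is written to disk.
publish_edition(update_whoisi=True, dry_run=True)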