daily-edition: publish_edition.py @ 21:6a6632954dc6
added logging statement while parsing feeds
| author   | Atul Varma <varmaa@toolness.com> |
|----------|----------------------------------|
| date     | Sun, 03 Jan 2010 04:41:44 -0800  |
| parents  | 5e7ecaf68b9a                     |
| children | f2fafca30ff3                     |
import os
import logging
import traceback
import optparse
import cPickle as pickle
from datetime import date, datetime, timedelta

import feedparser

from whoisi_cache import WhoisiServer, WhoisiCache, json
from url_cache import UrlCache

AUTHORS_FILENAME = 'authors.txt'
URLS_FILENAME = 'url_cache.dat'
WHOISI_FILENAME = 'whoisi_cache.dat'
ARTICLES_FILENAME = 'articles.dat'
JSON_FILENAME = 'daily-edition.json'

def load(filename, default):
    # Unpickle filename if it exists, else return the given default.
    if os.path.exists(filename):
        return pickle.load(open(filename, 'r'))
    return default

def save(obj, filename):
    f = open(filename, 'w')
    pickle.dump(obj, f)
    f.close()

def to_date_tuple(dt):
    return (dt.year, dt.month, dt.day)

def refresh_urls(feeds, urls):
    # Re-fetch every feed URL, logging failures instead of aborting.
    ucache = UrlCache(urls)
    for feed_urls in feeds.values():
        for url in feed_urls:
            try:
                ucache.refresh(url)
            except Exception:
                traceback.print_exc()

def refresh_articles(articles, feeds, urls):
    for author, feed_urls in feeds.items():
        articles[author] = []
        for url in feed_urls:
            logging.debug('parsing feed at %s.' % url)
            feed = feedparser.parse(urls[url]['data'])
            for entry in feed['entries']:
                updated = entry.get('updated_parsed')
                if updated is None:
                    # Skip entries that carry no parseable date.
                    continue
                updated = date(year=updated.tm_year,
                               month=updated.tm_mon,
                               day=updated.tm_mday)
                content = entry.get('content', '')
                summary = entry.get('summary', '')
                summary_detail = entry.get('summary_detail', {})
                # Fall back on summary_detail, then the plain summary,
                # when the entry has no full content.
                if not content:
                    if summary_detail and summary_detail.get('value'):
                        content = [summary_detail]
                    elif summary:
                        content = [{'type': 'text/plain',
                                    'value': summary}]
                if content:
                    article = {'url': entry.get('link'),
                               'title': entry.get('title'),
                               'pub_date': updated,
                               'content': content}
                    articles[author].append(article)

def filter_articles(names, articles):
    # Keep only articles from the last three days (excluding today),
    # by authors we follow, converted to JSON-friendly dicts.
    max_date = date.today()
    min_date = max_date - timedelta(days=3)
    published_authors = [author for author in names
                         if author in articles]
    filtered_articles = {}
    for author in published_authors:
        filtered_articles[author] = [
            {'url': article['url'],
             'title': article['title'],
             'content': article['content'],
             'pubDate': to_date_tuple(article['pub_date'])}
            for article in articles[author]
            if min_date < article['pub_date'] < max_date
            ]
    return filtered_articles

def publish_edition(update_whoisi=False,
                    update_urls=False,
                    update_articles=False):
    # Each update stage implies the ones downstream of it.
    if update_whoisi:
        update_urls = True
    if update_urls:
        update_articles = True

    names = [line.strip()
             for line in open(AUTHORS_FILENAME, 'r').readlines()
             if line.strip() and not line.startswith('#')]

    people = load(WHOISI_FILENAME, [])
    wiserver = WhoisiServer()
    wicache = WhoisiCache(wiserver, people)
    if update_whoisi:
        wicache.update()

    following = [person for person in people
                 if person['name'] in names]

    if update_whoisi:
        people_indexes = [people.index(person)
                          for person in following]
        wicache.refresh_people(people_indexes)
        save(people, WHOISI_FILENAME)

    # Collect the feed URLs for everyone we're following.
    feeds = {}
    for person in following:
        person_feeds = []
        for site in person['sites'].values():
            if site['type'] == 'feed':
                person_feeds.append(site['feed'])
        feeds[person['name']] = person_feeds

    urls = load(URLS_FILENAME, {})
    if update_urls:
        refresh_urls(feeds=feeds, urls=urls)
        save(urls, URLS_FILENAME)

    articles = load(ARTICLES_FILENAME, {})
    if update_articles:
        refresh_articles(articles=articles, feeds=feeds, urls=urls)
        save(articles, ARTICLES_FILENAME)

    filtered_articles = filter_articles(names=names, articles=articles)

    json.dump({'authors': names,
               'articles': filtered_articles},
              open(JSON_FILENAME, 'w'))
    logging.info('wrote %s.' % JSON_FILENAME)

parser_options = {
    ('-w', '--update-whoisi',): dict(dest='update_whoisi',
                                     help='re-sync with whoisi.com',
                                     action='store_true',
                                     default=False),
    ('-u', '--update-urls',): dict(dest='update_urls',
                                   help='refresh feeds',
                                   action='store_true',
                                   default=False),
    ('-a', '--update-articles',): dict(dest='update_articles',
                                       help='re-parse articles',
                                       action='store_true',
                                       default=False),
    }

if __name__ == '__main__':
    logging.basicConfig(level=logging.DEBUG)
    parser = optparse.OptionParser()
    for names, opts in parser_options.items():
        parser.add_option(*names, **opts)
    (options, args) = parser.parse_args()
    publish_edition(**options.__dict__)
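
Usage note (a sketch based on the optparse flags defined above, not part of the original file): running the script with all three flags forces a full rebuild, and since publish_edition() cascades update_whoisi into update_urls and update_articles, the -w flag alone has the same effect:

    python publish_edition.py --update-whoisi --update-urls --update-articles
    python publish_edition.py -w

The script expects an authors.txt file next to it (one author name per line, lines starting with # ignored) and writes its output to daily-edition.json, whose shape follows filter_articles(). The author name below is hypothetical:

    {"authors": ["Jane Doe"],
     "articles": {"Jane Doe": [{"url": "...", "title": "...",
                                "content": [{"type": "text/plain",
                                             "value": "..."}],
                                "pubDate": [2010, 1, 2]}]}}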