Mercurial > daily-edition
changeset 24:2b4cf6903012
implemented a basic algorithm for filtering articles.
author | Atul Varma <varmaa@toolness.com> |
---|---|
date | Sun, 03 Jan 2010 06:19:59 -0800 |
parents | 8b501dfe7d85 |
children | 35a52998884f |
files | publish_edition.py |
diffstat | 1 files changed, 41 insertions(+), 7 deletions(-) [+] |
line wrap: on
line diff
```diff
--- a/publish_edition.py	Sun Jan 03 05:01:38 2010 -0800
+++ b/publish_edition.py	Sun Jan 03 06:19:59 2010 -0800
@@ -69,27 +69,61 @@
                    'content': content}
         articles[author].append(article)
 
-def filter_articles(names, articles, issues):
-    max_date = date.today()
-    min_date = max_date - timedelta(days=3)
+def normalize(obj):
+    if isinstance(obj, list):
+        for i in range(len(obj)):
+            obj[i] = normalize(obj[i])
+    elif isinstance(obj, dict):
+        for key in obj:
+            obj[key] = normalize(obj[key])
+    elif isinstance(obj, str):
+        obj = obj.decode('utf-8', 'ignore')
+    return obj
+
+def filter_articles(names, articles, issues,
+                    max_articles_per_author=1,
+                    max_word_count=2500,
+                    max_article_age=timedelta(days=30)):
+    min_date = date.today() - max_article_age
 
     published_authors = [author for author in names
                          if author in articles]
 
     filtered_articles = {}
+    words_left = max_word_count
 
     for author in published_authors:
-        filtered_articles[author] = [
+        articles_left = max_articles_per_author
+        potential_articles = [
             {'url': article['url'],
              'title': article['title'],
              'content': article['content'],
              'pubDate': to_date_tuple(article['pub_date'])}
             for article in articles[author]
-            if (article['pub_date'] > min_date and
-                article['pub_date'] < max_date)
+            if (article['pub_date'] > min_date
+                and article['url'] not in issues['urls'])
             ]
 
-    return filtered_articles
+        for article in potential_articles:
+            html = [ctype['value']
+                    for ctype in article['content']
+                    if ctype.get('type') == 'text/html']
+            if not html:
+                logging.warn('no html content for %s.' % article['url'])
+            elif len(html) > 1:
+                logging.warn('multiple html found for %s.' % article['url'])
+            else:
+                word_count = len(html[0].split())
+                if word_count < words_left:
+                    if author not in filtered_articles:
+                        filtered_articles[author] = []
+                    filtered_articles[author].append(article)
+                    words_left -= word_count
+                    articles_left -= 1
+                    if not articles_left:
+                        break
+
+    return normalize(filtered_articles)
 
 def publish_edition(update_whoisi=False,
                     update_urls=False,
```