changeset 24:2b4cf6903012

implemented a basic algorithm for filtering articles.
author Atul Varma <varmaa@toolness.com>
date Sun, 03 Jan 2010 06:19:59 -0800
parents 8b501dfe7d85
children 35a52998884f
files publish_edition.py
diffstat 1 files changed, 41 insertions(+), 7 deletions(-) [+]
line wrap: on
line diff
--- a/publish_edition.py	Sun Jan 03 05:01:38 2010 -0800
+++ b/publish_edition.py	Sun Jan 03 06:19:59 2010 -0800
@@ -69,27 +69,61 @@
                                'content': content}
                     articles[author].append(article)
 
-def filter_articles(names, articles, issues):
-    max_date = date.today()
-    min_date = max_date - timedelta(days=3)
+def normalize(obj):
+    if isinstance(obj, list):
+        for i in range(len(obj)):
+            obj[i] = normalize(obj[i])
+    elif isinstance(obj, dict):
+        for key in obj:
+            obj[key] = normalize(obj[key])
+    elif isinstance(obj, str):
+        obj = obj.decode('utf-8', 'ignore')
+    return obj
+
+def filter_articles(names, articles, issues,
+                    max_articles_per_author=1,
+                    max_word_count=2500,
+                    max_article_age=timedelta(days=30)):
+    min_date = date.today() - max_article_age
 
     published_authors = [author for author in names
                          if author in articles]
 
     filtered_articles = {}
+    words_left = max_word_count
 
     for author in published_authors:
-        filtered_articles[author] = [
+        articles_left = max_articles_per_author
+        potential_articles = [
             {'url': article['url'],
              'title': article['title'],
              'content': article['content'],
              'pubDate': to_date_tuple(article['pub_date'])}
             for article in articles[author]
-            if (article['pub_date'] > min_date and
-                article['pub_date'] < max_date)
+            if (article['pub_date'] > min_date
+                and article['url'] not in issues['urls'])
             ]
 
-    return filtered_articles
+        for article in potential_articles:
+            html = [ctype['value']
+                    for ctype in article['content']
+                    if ctype.get('type') == 'text/html']
+            if not html:
+                logging.warn('no html content for %s.' % article['url'])
+            elif len(html) > 1:
+                logging.warn('multiple html found for %s.' % article['url'])
+            else:
+                word_count = len(html[0].split())
+                if word_count < words_left:
+                    if author not in filtered_articles:
+                        filtered_articles[author] = []
+                    filtered_articles[author].append(article)
+                words_left -= word_count
+                articles_left -= 1
+                if not articles_left:
+                    break
+
+    return normalize(filtered_articles)
 
 def publish_edition(update_whoisi=False,
                     update_urls=False,