annotate publish_edition.py @ 28:6e2038000082

added dry-run cmdline option
author Atul Varma <varmaa@toolness.com>
date Sun, 03 Jan 2010 09:20:29 -0800
parents 83091b7b3e59
children 6fbd38dd976a
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
13
69fd13a4aef4 Added an html viewer and a publish_edition.py script that generates the JSON which the html viewer reads.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
1 import os
69fd13a4aef4 Added an html viewer and a publish_edition.py script that generates the JSON which the html viewer reads.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
2 import logging
69fd13a4aef4 Added an html viewer and a publish_edition.py script that generates the JSON which the html viewer reads.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
3 import traceback
15
f0dd39b7cbe1 added cmdline options
Atul Varma <varmaa@toolness.com>
parents: 14
diff changeset
4 import optparse
13
69fd13a4aef4 Added an html viewer and a publish_edition.py script that generates the JSON which the html viewer reads.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
5 import cPickle as pickle
69fd13a4aef4 Added an html viewer and a publish_edition.py script that generates the JSON which the html viewer reads.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
6 from datetime import date, datetime, timedelta
69fd13a4aef4 Added an html viewer and a publish_edition.py script that generates the JSON which the html viewer reads.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
7
69fd13a4aef4 Added an html viewer and a publish_edition.py script that generates the JSON which the html viewer reads.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
8 import feedparser
69fd13a4aef4 Added an html viewer and a publish_edition.py script that generates the JSON which the html viewer reads.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
9 from whoisi_cache import WhoisiServer, WhoisiCache, json
69fd13a4aef4 Added an html viewer and a publish_edition.py script that generates the JSON which the html viewer reads.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
10 from url_cache import UrlCache
69fd13a4aef4 Added an html viewer and a publish_edition.py script that generates the JSON which the html viewer reads.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
11
69fd13a4aef4 Added an html viewer and a publish_edition.py script that generates the JSON which the html viewer reads.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
# Text file listing the authors to follow, read line by line by
# publish_edition().
AUTHORS_FILENAME = 'authors.txt'

# Pickle files holding the state that is kept between runs.
URLS_FILENAME = 'url_cache.dat'
WHOISI_FILENAME = 'whoisi_cache.dat'
ARTICLES_FILENAME = 'articles.dat'
ISSUES_FILENAME = 'issues.dat'

# JSON document written by publish_edition() for each published issue.
JSON_FILENAME = 'daily-edition.json'
69fd13a4aef4 Added an html viewer and a publish_edition.py script that generates the JSON which the html viewer reads.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
18
69fd13a4aef4 Added an html viewer and a publish_edition.py script that generates the JSON which the html viewer reads.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
def load(filename, default):
    """Return the object pickled in *filename*, or *default* if it is absent.

    :param filename: path of a pickle file.
    :param default: value returned unchanged when *filename* does not exist.
    :returns: the unpickled object, or *default*.

    Errors raised while opening or unpickling an existing file propagate to
    the caller.
    """
    if not os.path.exists(filename):
        return default
    # NOTE: unpickling can run arbitrary code; only point this at files that
    # this script produced itself.
    # Text mode is kept on purpose so that it matches the mode save() uses.
    with open(filename, 'r') as f:
        return pickle.load(f)
69fd13a4aef4 Added an html viewer and a publish_edition.py script that generates the JSON which the html viewer reads.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
23
69fd13a4aef4 Added an html viewer and a publish_edition.py script that generates the JSON which the html viewer reads.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
def save(obj, filename):
    """Pickle *obj* into *filename*, replacing any existing file.

    :param obj: any picklable object.
    :param filename: path of the file to (over)write.
    """
    # The context manager closes the file even when pickle.dump() raises,
    # e.g. because part of *obj* cannot be pickled.
    with open(filename, 'w') as f:
        pickle.dump(obj, f)
69fd13a4aef4 Added an html viewer and a publish_edition.py script that generates the JSON which the html viewer reads.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
28
69fd13a4aef4 Added an html viewer and a publish_edition.py script that generates the JSON which the html viewer reads.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
def to_date_tuple(dt):
    """Return the calendar date of *dt* as a ``(year, month, day)`` tuple.

    *dt* may be any object exposing ``year``, ``month`` and ``day``
    attributes, such as a ``date`` or a ``datetime``.
    """
    year, month, day = dt.year, dt.month, dt.day
    return (year, month, day)
69fd13a4aef4 Added an html viewer and a publish_edition.py script that generates the JSON which the html viewer reads.
Atul Varma <varmaa@toolness.com>
parents:
diff changeset
31
19
7fe2efecc9c3 more refactoring
Atul Varma <varmaa@toolness.com>
parents: 17
diff changeset
def refresh_urls(feeds, urls):
    """Refresh the cached copy of every feed URL found in *feeds*.

    :param feeds: mapping whose values are iterables of feed URLs.
    :param urls: cache storage handed to ``UrlCache`` (presumably updated
        in place by ``UrlCache.refresh()`` -- confirm in url_cache).

    Refreshing is best-effort: when one URL fails, its traceback is printed
    and the remaining URLs are still processed.
    """
    ucache = UrlCache(urls)
    for feed_urls in feeds.values():
        for url in feed_urls:
            try:
                ucache.refresh(url)
            except Exception:
                # print_exc() already reports the exception being handled;
                # its first positional argument is a traceback depth limit,
                # not an exception object.
                traceback.print_exc()
7fe2efecc9c3 more refactoring
Atul Varma <varmaa@toolness.com>
parents: 17
diff changeset
40
7fe2efecc9c3 more refactoring
Atul Varma <varmaa@toolness.com>
parents: 17
diff changeset
def refresh_articles(articles, feeds, urls):
    """Rebuild each author's article list from the cached feed data.

    :param articles: dict updated in place; ``articles[author]`` is replaced
        by a fresh list of article dicts with the keys ``url``, ``title``,
        ``pub_date`` (a ``datetime.date``) and ``content``.
    :param feeds: dict mapping an author name to a list of feed URLs.
    :param urls: dict mapping a feed URL to a cache entry whose ``'data'``
        item is what gets passed to ``feedparser.parse()``.

    Feed URLs without a cache entry and entries without a parsed update time
    are skipped with a warning instead of aborting the whole refresh.
    Entries for which no content can be found are left out.
    """
    for author, feed_urls in feeds.items():
        author_articles = articles[author] = []
        for url in feed_urls:
            logging.debug('parsing feed at %s.' % url)
            if url not in urls:
                # refresh_urls() tolerates per-URL failures, so a feed may
                # legitimately be missing from the cache.
                logging.warning('no cached data for feed %s; skipping.' % url)
                continue
            feed = feedparser.parse(urls[url]['data'])
            for entry in feed['entries']:
                updated = entry.get('updated_parsed')
                if updated is None:
                    # Without a date the article cannot be age-filtered
                    # later on, so it is not recorded at all.
                    logging.warning('entry without update time in %s: %s.'
                                    % (url, entry.get('link')))
                    continue
                pub_date = date(year=updated.tm_year,
                                month=updated.tm_mon,
                                day=updated.tm_mday)

                # Prefer the full content, then the detailed summary, and
                # finally the bare summary wrapped up as plain text.
                content = entry.get('content', '')
                if not content:
                    summary_detail = entry.get('summary_detail', {})
                    summary = entry.get('summary', '')
                    if summary_detail and summary_detail.get('value'):
                        content = [summary_detail]
                    elif summary:
                        content = [{'type': 'text/plain',
                                    'value': summary}]
                if not content:
                    continue

                author_articles.append({'url': entry.get('link'),
                                        'title': entry.get('title'),
                                        'pub_date': pub_date,
                                        'content': content})
7fe2efecc9c3 more refactoring
Atul Varma <varmaa@toolness.com>
parents: 17
diff changeset
71
24
2b4cf6903012 implemented a basic algorithm for filtering articles.
Atul Varma <varmaa@toolness.com>
parents: 23
diff changeset
def normalize(obj):
    """Recursively decode the byte strings inside *obj* into unicode text.

    Lists and dict values are converted in place (dict keys are left
    untouched). Byte strings are decoded as UTF-8; when that fails a
    warning is logged and the undecodable bytes are dropped.

    :param obj: a list, dict, byte string or any other value.
    :returns: *obj* itself for lists and dicts, the decoded text for a byte
        string, and any other value unchanged.
    """
    if isinstance(obj, list):
        for index, item in enumerate(obj):
            obj[index] = normalize(item)
    elif isinstance(obj, dict):
        # Only existing keys are reassigned, so the dict keeps its size
        # while it is being iterated.
        for key in obj:
            obj[key] = normalize(obj[key])
    elif isinstance(obj, bytes):
        # ``bytes`` is the same type as ``str`` on Python 2.6+.
        try:
            obj = obj.decode('utf-8')
        except UnicodeDecodeError as e:
            logging.warning('error decoding "%s" (%s).' % (repr(obj), e))
            obj = obj.decode('utf-8', 'ignore')
    return obj
2b4cf6903012 implemented a basic algorithm for filtering articles.
Atul Varma <varmaa@toolness.com>
parents: 23
diff changeset
86
2b4cf6903012 implemented a basic algorithm for filtering articles.
Atul Varma <varmaa@toolness.com>
parents: 23
diff changeset
def filter_articles(names, articles, issues,
                    max_articles_per_author=1,
                    max_word_count=2500,
                    max_article_age=timedelta(days=30)):
    """Select the articles that go into the next issue.

    :param names: author names, in the order in which they are considered.
    :param articles: dict mapping an author name to a list of article dicts
        with the keys ``url``, ``title``, ``content`` and ``pub_date``.
    :param issues: dict whose ``'urls'`` item contains the URLs that were
        already published; those articles are not selected again.
    :param max_articles_per_author: number of selected articles after which
        the remaining articles of an author are no longer considered.
    :param max_word_count: word budget shared by all selected articles.
    :param max_article_age: articles whose ``pub_date`` is not more recent
        than today minus this ``timedelta`` are ignored.
    :returns: the result of ``normalize()`` applied to a dict mapping an
        author name to the list of selected articles, each a dict with the
        keys ``url``, ``title``, ``content`` and ``pubDate`` (a
        ``(year, month, day)`` tuple). Authors without a selected article
        are absent.

    Only articles with exactly one non-empty ``text/html`` content item are
    eligible. The word count is the number of whitespace-separated tokens
    of that item, markup included. The word budget and the per-author quota
    are charged only for articles that are actually selected, so an article
    that does not fit cannot use up room meant for later ones.
    """
    min_date = date.today() - max_article_age

    filtered_articles = {}
    words_left = max_word_count

    for author in names:
        if author not in articles:
            continue

        articles_left = max_articles_per_author
        potential_articles = [
            {'url': article['url'],
             'title': article['title'],
             'content': article['content'],
             'pubDate': to_date_tuple(article['pub_date'])}
            for article in articles[author]
            if (article['pub_date'] > min_date
                and article['url'] not in issues['urls'])
            ]

        for article in potential_articles:
            html = [ctype['value']
                    for ctype in article['content']
                    if ctype.get('type') == 'text/html'
                    and ctype.get('value')]
            if not html:
                logging.warning('no html content for %s.' % article['url'])
                continue
            if len(html) > 1:
                logging.warning('multiple html found for %s.'
                                % article['url'])
                continue

            word_count = len(html[0].split())
            if word_count < words_left:
                filtered_articles.setdefault(author, []).append(article)
                words_left -= word_count
                articles_left -= 1
                if not articles_left:
                    break
            elif word_count > max_word_count:
                # Too long even for an otherwise empty issue.
                logging.warning(
                    'article will never be included in an '
                    'issue due to word count: %s (%d words)' % (
                        article['url'],
                        word_count
                        ))

    return normalize(filtered_articles)
22
f2fafca30ff3 added issue tracking information, though it's not yet being used in any way.
Atul Varma <varmaa@toolness.com>
parents: 21
diff changeset
139
14
4a2499602804 refactorings to publish_edition.py
Atul Varma <varmaa@toolness.com>
parents: 13
diff changeset
def _read_author_names(filename):
    """Return the author names listed in *filename*, one per line.

    Surrounding whitespace is stripped; blank lines and lines whose first
    non-blank character is ``#`` are ignored.
    """
    names = []
    with open(filename, 'r') as f:
        for line in f:
            name = line.strip()
            if name and not name.startswith('#'):
                names.append(name)
    return names


def publish_edition(update_whoisi=False,
                    update_urls=False,
                    update_articles=False,
                    dry_run=False):
    """Build the next issue and write it to ``JSON_FILENAME``.

    :param update_whoisi: refresh the whoisi cache first; implies
        *update_urls*.
    :param update_urls: refresh the cached feed documents; implies
        *update_articles*.
    :param update_articles: re-parse the cached feeds into article lists.
    :param dry_run: perform all the work but write none of the cache files
        and no JSON output.

    The authors in ``AUTHORS_FILENAME`` are matched by name against the
    cached whoisi people; names without a match are reported in a warning.
    The selected articles are recorded in the issues cache under a new
    issue number so that later runs do not select them again.
    """
    # Each refresh stage feeds the next one, so enabling a stage enables
    # every stage after it.
    if update_whoisi:
        update_urls = True
    if update_urls:
        update_articles = True

    names = _read_author_names(AUTHORS_FILENAME)

    people = load(WHOISI_FILENAME, [])
    wiserver = WhoisiServer()
    wicache = WhoisiCache(wiserver, people)

    if update_whoisi:
        wicache.update()

    whoisi_names = [person['name'] for person in people]
    unknown_names = [name for name in names
                     if name not in whoisi_names]
    if unknown_names:
        logging.warning('could not find information on: %s.' %
                        ', '.join(unknown_names))

    following = [person for person in people
                 if person['name'] in names]

    if update_whoisi:
        people_indexes = [people.index(person)
                          for person in following]
        wicache.refresh_people(people_indexes)
        if not dry_run:
            save(people, WHOISI_FILENAME)

    # Feed URLs of every followed person, keyed by the person's name.
    feeds = {}
    for person in following:
        feeds[person['name']] = [site['feed']
                                 for site in person['sites'].values()
                                 if site['type'] == 'feed']

    urls = load(URLS_FILENAME, {})

    if update_urls:
        refresh_urls(feeds=feeds, urls=urls)
        if not dry_run:
            save(urls, URLS_FILENAME)

    articles = load(ARTICLES_FILENAME, {})

    if update_articles:
        refresh_articles(articles=articles,
                         feeds=feeds,
                         urls=urls)
        if not dry_run:
            save(articles, ARTICLES_FILENAME)

    issues = load(ISSUES_FILENAME, {'urls': {},
                                    'pub_dates': []})

    filtered_articles = filter_articles(names=names,
                                        articles=articles,
                                        issues=issues)

    # The new issue's number is its position in the publication history.
    issue_id = len(issues['pub_dates'])
    issues['pub_dates'].append(datetime.now())
    for author in filtered_articles:
        for article in filtered_articles[author]:
            issues['urls'][article['url']] = issue_id

    if not dry_run:
        save(issues, ISSUES_FILENAME)

        with open(JSON_FILENAME, 'w') as f:
            json.dump({'authors': names, 'articles': filtered_articles}, f)

        logging.info('wrote %s (issue #%d).' % (JSON_FILENAME, issue_id))
14
4a2499602804 refactorings to publish_edition.py
Atul Varma <varmaa@toolness.com>
parents: 13
diff changeset
222
15
f0dd39b7cbe1 added cmdline options
Atul Varma <varmaa@toolness.com>
parents: 14
diff changeset
# Table of command-line switches understood by this script.
#
# Each key is the (short, long) flag pair handed to
# ``OptionParser.add_option`` as positional arguments; each value holds the
# keyword arguments for that same call.  Every switch is a boolean
# ``store_true`` flag that defaults to False, and its ``dest`` name is the
# keyword under which the parsed value is forwarded to ``publish_edition``.
parser_options = {
    ('-w', '--refresh-whoisi'): {
        'dest': 'update_whoisi',
        'help': 're-sync with whoisi.com',
        'action': 'store_true',
        'default': False,
    },

    ('-f', '--refresh-feeds'): {
        'dest': 'update_urls',
        'help': 'refresh feeds',
        'action': 'store_true',
        'default': False,
    },

    ('-p', '--reparse-feeds'): {
        'dest': 'update_articles',
        'help': 're-parse feeds',
        'action': 'store_true',
        'default': False,
    },

    ('-d', '--dry-run'): {
        'dest': 'dry_run',
        'help': 'do not write anything to disk',
        'action': 'store_true',
        'default': False,
    },
}
f0dd39b7cbe1 added cmdline options
Atul Varma <varmaa@toolness.com>
parents: 14
diff changeset
248
14
4a2499602804 refactorings to publish_edition.py
Atul Varma <varmaa@toolness.com>
parents: 13
diff changeset
if __name__ == '__main__':
    # Script entry point: configure logging, parse the command-line
    # switches declared in ``parser_options`` and run ``publish_edition``
    # with the parsed values as keyword arguments.
    logging.basicConfig(level=logging.DEBUG)

    parser = optparse.OptionParser()
    # Register the switches in sorted order so that ``--help`` lists them
    # identically on every run; plain dict iteration order is not
    # guaranteed on older Python versions.
    for flags, opts in sorted(parser_options.items()):
        parser.add_option(*flags, **opts)
    (options, args) = parser.parse_args()

    # The script takes no positional arguments.  Refuse them instead of
    # silently ignoring them: a mistyped switch such as ``dry-run`` (without
    # the dashes) would otherwise start a real run that writes to disk.
    if args:
        parser.error('unexpected argument(s): %s' % ' '.join(args))

    # Each option's ``dest`` name doubles as a publish_edition keyword.
    publish_edition(**vars(options))